From 8b644f19cacb59fe17d1876436a60f3d02839b8c Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Fri, 3 Nov 2017 02:33:01 +0100 Subject: [PATCH] Separate management of GPU contexts from modules Tasks from modules might share GPU contexts across either tasks, or thread-MPI ranks, so init and free operations can't be the responsibility of the modules themselves. Simplified the error reporting for init and free. Knowledge of the rank ID might help in diagnosing issues in some cases, but should (later) be the responsibility of a proper framework to catch errors during initialization across MPI ranks. Moved the GPU profiling cleanup back to where it was intended to be, before some earlier refactoring had left it somewhere not-quite-right. Change-Id: I682a1b1c7058cbebb41805dba05e688cbee18c2a --- src/gromacs/ewald/pme-gpu-internal.cpp | 11 +-- src/gromacs/ewald/pme-gpu-internal.h | 5 +- src/gromacs/ewald/pme.cpp | 4 +- src/gromacs/ewald/tests/pmetestcommon.cpp | 5 ++ src/gromacs/gpu_utils/gpu_utils.cu | 29 +++---- src/gromacs/gpu_utils/gpu_utils.h | 9 +-- src/gromacs/gpu_utils/gpu_utils_ocl.cpp | 1 - src/gromacs/mdlib/force.h | 3 +- src/gromacs/mdlib/forcerec.cpp | 87 ++++++++------------- src/gromacs/mdlib/forcerec.h | 28 +++---- src/gromacs/taskassignment/taskassignment.h | 7 ++ src/programs/mdrun/runner.cpp | 38 ++++++--- 12 files changed, 108 insertions(+), 119 deletions(-) diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp index 6e1d9992be..9487fd0c5d 100644 --- a/src/gromacs/ewald/pme-gpu-internal.cpp +++ b/src/gromacs/ewald/pme-gpu-internal.cpp @@ -282,11 +282,8 @@ static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error) * * \param[in,out] pme The PME structure. * \param[in,out] gpuInfo The GPU information structure. - * \param[in] mdlog The logger. - * \param[in] cr The communication structure. */ -static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, - const t_commrec *cr) +static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo) { std::string errorString; bool canRunOnGpu = pme_gpu_check_restrictions(pme, &errorString); @@ -308,8 +305,6 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx:: pme_gpu_set_testing(pmeGPU, false); - // GPU initialization - init_gpu(mdlog, cr->nodeid, gpuInfo); pmeGPU->deviceInfo = gpuInfo; pme_gpu_init_internal(pmeGPU); @@ -395,7 +390,7 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx: } } -void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr) +void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo) { if (!pme_gpu_active(pme)) { @@ -405,7 +400,7 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLog if (!pme->gpu) { /* First-time initialization */ - pme_gpu_init(pme, gpuInfo, mdlog, cr); + pme_gpu_init(pme, gpuInfo); } else { diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h index be50ead6af..33c83103a8 100644 --- a/src/gromacs/ewald/pme-gpu-internal.h +++ b/src/gromacs/ewald/pme-gpu-internal.h @@ -55,7 +55,6 @@ struct gmx_hw_info_t; struct gmx_gpu_opt_t; struct gmx_pme_t; // only used in pme_gpu_reinit -struct t_commrec; struct gmx_wallclock_gpu_pme_t; struct pme_atomcomm_t; struct t_complex; @@ -636,11 +635,9 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx: * * \param[in,out] pme The PME structure. 
* \param[in,out] gpuInfo The GPU information structure. - * \param[in] mdlog The logger. - * \param[in] cr The communication structure. * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs. */ -void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr); +void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo); /*! \libinternal \brief * Destroys the PME GPU data at the end of the run. diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp index 344d8553cc..c90254b0a3 100644 --- a/src/gromacs/ewald/pme.cpp +++ b/src/gromacs/ewald/pme.cpp @@ -520,7 +520,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr, PmeRunMode runMode, PmeGpu *pmeGPU, gmx_device_info_t *gpuInfo, - const gmx::MDLogger &mdlog) + const gmx::MDLogger & /*mdlog*/) { int use_threads, sum_use_threads, i; ivec ndata; @@ -853,7 +853,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr, pme->lb_buf2 = nullptr; pme->lb_buf_nalloc = 0; - pme_gpu_reinit(pme.get(), gpuInfo, mdlog, cr); + pme_gpu_reinit(pme.get(), gpuInfo); pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx); diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index f7c7b0f141..f9ea663ce6 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -52,6 +52,7 @@ #include "gromacs/ewald/pme-solve.h" #include "gromacs/ewald/pme-spread.h" #include "gromacs/fft/parallel_3dfft.h" +#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/math/invertmatrix.h" #include "gromacs/mdtypes/commrec.h" #include "gromacs/pbcutil/pbc.h" @@ -103,6 +104,10 @@ static PmeSafePointer pmeInitInternal(const t_inputrec *inputRec, ) { const MDLogger dummyLogger; + if (gpuInfo) + { + init_gpu(dummyLogger, gpuInfo); + } const auto runMode = (mode == CodePath::CPU) ? 
PmeRunMode::CPU : PmeRunMode::GPU; t_commrec dummyCommrec = {0}; gmx_pme_t *pmeDataRaw = gmx_pme_init(&dummyCommrec, 1, 1, inputRec, atomCount, false, false, true, diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index fb509f5d37..4e22007940 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -55,11 +55,13 @@ #include "gromacs/hardware/gpu_hw_info.h" #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/logger.h" #include "gromacs/utility/programcontext.h" #include "gromacs/utility/smalloc.h" #include "gromacs/utility/snprintf.h" +#include "gromacs/utility/stringutil.h" #if HAVE_NVML #include @@ -507,20 +509,18 @@ static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused #endif /* HAVE_NVML_APPLICATION_CLOCKS */ } -void init_gpu(const gmx::MDLogger &mdlog, int rank, - gmx_device_info_t *deviceInfo) +void init_gpu(const gmx::MDLogger &mdlog, + gmx_device_info_t *deviceInfo) { cudaError_t stat; - char sbuf[STRLEN]; assert(deviceInfo); stat = cudaSetDevice(deviceInfo->id); if (stat != cudaSuccess) { - snprintf(sbuf, STRLEN, "On rank %d failed to initialize GPU #%d", - rank, deviceInfo->id); - CU_RET_ERR(stat, sbuf); + auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id); + CU_RET_ERR(stat, message.c_str()); } if (debug) @@ -534,13 +534,9 @@ void init_gpu(const gmx::MDLogger &mdlog, int rank, init_gpu_application_clocks(mdlog, deviceInfo); } -gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo, - char *result_str) +void free_gpu(const gmx_device_info_t *deviceInfo) { cudaError_t stat; - gmx_bool reset_gpu_application_clocks_status = true; - - assert(result_str); if (debug) { @@ -552,12 +548,17 @@ gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo, if (deviceInfo != nullptr) { - reset_gpu_application_clocks_status = reset_gpu_application_clocks(deviceInfo); + if (!reset_gpu_application_clocks(deviceInfo)) + { + gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id); + } } stat = cudaDeviceReset(); - strncpy(result_str, cudaGetErrorString(stat), STRLEN); - return (stat == cudaSuccess) && reset_gpu_application_clocks_status; + if (stat != cudaSuccess) + { + gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat)); + } } gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info, diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h index 8f59630462..b1cc07b51f 100644 --- a/src/gromacs/gpu_utils/gpu_utils.h +++ b/src/gromacs/gpu_utils/gpu_utils.h @@ -114,7 +114,6 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU * the patterns here are the same as elsewhere in this header. * * param[in] mdlog log file to write to - * param[in] rank MPI rank of this process (for error output) * \param[inout] deviceInfo device info of the GPU to initialize * * Issues a fatal error for any critical errors that occur during @@ -122,7 +121,6 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU */ GPU_FUNC_QUALIFIER void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog), - int GPU_FUNC_ARGUMENT(rank), gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM /*! \brief Frees up the CUDA GPU used by the active context at the time of calling. 
@@ -130,15 +128,14 @@ void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog), * The context is explicitly destroyed and therefore all data uploaded to the GPU * is lost. This should only be called when none of this data is required anymore. * + * Calls gmx_warning upon errors. + * * \param[in] deviceInfo device info of the GPU to clean up for - * \param[out] result_str the message related to the error that occurred - * during the initialization (if there was any). * * \returns true if no error occurs during the freeing. */ CUDA_FUNC_QUALIFIER -gmx_bool free_cuda_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo), - char *CUDA_FUNC_ARGUMENT(result_str)) CUDA_FUNC_TERM_WITH_RETURN(TRUE) +void free_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo)) CUDA_FUNC_TERM /*! \brief Return a pointer to the device info for \c deviceId * diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp index 5195baccc5..c121f813b3 100644 --- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp +++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp @@ -401,7 +401,6 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind //! This function is documented in the header file void init_gpu(const gmx::MDLogger & /*mdlog*/, - int /* rank */, gmx_device_info_t *deviceInfo) { assert(deviceInfo); diff --git a/src/gromacs/mdlib/force.h b/src/gromacs/mdlib/force.h index 87d4fe9496..2d83e17a25 100644 --- a/src/gromacs/mdlib/force.h +++ b/src/gromacs/mdlib/force.h @@ -216,7 +216,6 @@ void do_force_lowlevel(t_forcerec *fr, /* Call all the force routines */ void free_gpu_resources(const t_forcerec *fr, - const t_commrec *cr, - const gmx_device_info_t *deviceInfo); + const t_commrec *cr); #endif diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 05c59c2f29..0d9cfb49c7 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -2134,19 +2134,15 @@ init_interaction_const(FILE *fp, *interaction_const = ic; } -/* TODO deviceInfo should be logically const, but currently - * init_gpu modifies it to set up NVML support. This could - * happen during the detection phase, and deviceInfo could - * the become const. */ -static void init_nb_verlet(const gmx::MDLogger &mdlog, - nonbonded_verlet_t **nb_verlet, - gmx_bool bFEP_NonBonded, - const t_inputrec *ir, - const t_forcerec *fr, - const t_commrec *cr, - gmx_device_info_t *deviceInfo, - const gmx_mtop_t *mtop, - matrix box) +static void init_nb_verlet(const gmx::MDLogger &mdlog, + nonbonded_verlet_t **nb_verlet, + gmx_bool bFEP_NonBonded, + const t_inputrec *ir, + const t_forcerec *fr, + const t_commrec *cr, + const gmx_device_info_t *deviceInfo, + const gmx_mtop_t *mtop, + matrix box) { nonbonded_verlet_t *nbv; char *env; @@ -2161,12 +2157,6 @@ static void init_nb_verlet(const gmx::MDLogger &mdlog, GMX_RELEASE_ASSERT(!(nbv->emulateGpu == EmulateGpuNonbonded::Yes && nbv->bUseGPU), "When GPU emulation is active, there cannot be a GPU assignment"); - if (nbv->bUseGPU) - { - /* Use the assigned GPU. 
*/ - init_gpu(mdlog, cr->nodeid, deviceInfo); - } - nbv->nbs = nullptr; nbv->min_ci_balanced = 0; @@ -2319,20 +2309,20 @@ gmx_bool usingGpu(nonbonded_verlet_t *nbv) return nbv != nullptr && nbv->bUseGPU; } -void init_forcerec(FILE *fp, - const gmx::MDLogger &mdlog, - t_forcerec *fr, - t_fcdata *fcd, - const t_inputrec *ir, - const gmx_mtop_t *mtop, - const t_commrec *cr, - matrix box, - const char *tabfn, - const char *tabpfn, - const t_filenm *tabbfnm, - gmx_device_info_t *deviceInfo, - gmx_bool bNoSolvOpt, - real print_force) +void init_forcerec(FILE *fp, + const gmx::MDLogger &mdlog, + t_forcerec *fr, + t_fcdata *fcd, + const t_inputrec *ir, + const gmx_mtop_t *mtop, + const t_commrec *cr, + matrix box, + const char *tabfn, + const char *tabpfn, + const t_filenm *tabbfnm, + const gmx_device_info_t *deviceInfo, + gmx_bool bNoSolvOpt, + real print_force) { int i, m, negp_pp, negptable, egi, egj; real rtab; @@ -3156,27 +3146,26 @@ void init_forcerec(FILE *fp, } } -/* Frees GPU memory and destroys the GPU context. +/* Frees GPU memory and sets a tMPI node barrier. * * Note that this function needs to be called even if GPUs are not used * in this run because the PME ranks have no knowledge of whether GPUs * are used or not, but all ranks need to enter the barrier below. + * \todo Remove physical node barrier from this function after making sure + * that it's not needed anymore (with a shared GPU run). */ void free_gpu_resources(const t_forcerec *fr, - const t_commrec *cr, - const gmx_device_info_t *deviceInfo) + const t_commrec *cr) { - gmx_bool bIsPPrankUsingGPU; - char gpu_err_str[STRLEN]; + bool isPPrankUsingGPU = fr && fr->nbv && fr->nbv->bUseGPU; - bIsPPrankUsingGPU = thisRankHasDuty(cr, DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU; + /* stop the GPU profiler (only CUDA) */ + stopGpuProfiler(); - if (bIsPPrankUsingGPU) + if (isPPrankUsingGPU) { /* free nbnxn data in GPU memory */ nbnxn_gpu_free(fr->nbv->gpu_nbv); - /* stop the GPU profiler (only CUDA) */ - stopGpuProfiler(); } /* With tMPI we need to wait for all ranks to finish deallocation before @@ -3189,20 +3178,8 @@ void free_gpu_resources(const t_forcerec *fr, * Note: it is safe to not call the barrier on the ranks which do not use GPU, * but it is easier and more futureproof to call it on the whole node. 
*/
-#if GMX_THREAD_MPI
- if (PAR(cr) || MULTISIM(cr))
+ if (GMX_THREAD_MPI && (PAR(cr) || MULTISIM(cr)))
{
gmx_barrier_physical_node(cr);
}
-#endif /* GMX_THREAD_MPI */
-
- if (bIsPPrankUsingGPU)
- {
- /* uninitialize GPU (by destroying the context) */
- if (!free_cuda_gpu(deviceInfo, gpu_err_str))
- {
- gmx_warning("On rank %d failed to free GPU #%d: %s",
- cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
- }
- }
}
diff --git a/src/gromacs/mdlib/forcerec.h b/src/gromacs/mdlib/forcerec.h
index 5dd6a73fca..9906a755a1 100644
--- a/src/gromacs/mdlib/forcerec.h
+++ b/src/gromacs/mdlib/forcerec.h
@@ -111,20 +111,20 @@ void init_interaction_const_tables(FILE *fp,
* \param[in] bNoSolvOpt Do not use solvent optimization
* \param[in] print_force Print forces for atoms with force >= print_force
*/
-void init_forcerec(FILE *fplog,
- const gmx::MDLogger &mdlog,
- t_forcerec *fr,
- t_fcdata *fcd,
- const t_inputrec *ir,
- const gmx_mtop_t *mtop,
- const t_commrec *cr,
- matrix box,
- const char *tabfn,
- const char *tabpfn,
- const t_filenm *tabbfnm,
- gmx_device_info_t *deviceInfo,
- gmx_bool bNoSolvOpt,
- real print_force);
+void init_forcerec(FILE *fplog,
+ const gmx::MDLogger &mdlog,
+ t_forcerec *fr,
+ t_fcdata *fcd,
+ const t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ const t_commrec *cr,
+ matrix box,
+ const char *tabfn,
+ const char *tabpfn,
+ const t_filenm *tabbfnm,
+ const gmx_device_info_t *deviceInfo,
+ gmx_bool bNoSolvOpt,
+ real print_force);
/*! \brief Divide exclusions over threads
*
diff --git a/src/gromacs/taskassignment/taskassignment.h b/src/gromacs/taskassignment/taskassignment.h
index 294230543e..62123e0824 100644
--- a/src/gromacs/taskassignment/taskassignment.h
+++ b/src/gromacs/taskassignment/taskassignment.h
@@ -118,6 +118,13 @@ runTaskAssignment(const std::vector<int> &gpuIdsToUse,
const t_commrec *cr,
const std::vector<GpuTask> &gpuTasksOnThisRank);
+//! Function for whether the task of \c mapping has value \c TaskType.
+template<GpuTask TaskType>
+bool hasTaskType(const GpuTaskMapping &mapping)
+{
+ return mapping.task_ == TaskType;
+}
+
} // namespace
#endif
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index 53c55ff09f..b266ecb415 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -84,6 +84,7 @@
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/minimize.h"
#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_tuning.h"
#include "gromacs/mdlib/qmmm.h"
@@ -1009,22 +1010,25 @@ int Mdrunner::mdrunner()
cr, mdlog);
gmx_device_info_t *nonbondedDeviceInfo = nullptr;
- int nonbondedDeviceId = -1;
+
if (thisRankHasDuty(cr, DUTY_PP))
{
- if (!gpuTaskAssignment.empty())
+ // This works because only one task of each type is currently permitted.
+ auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+ hasTaskType<GpuTask::Nonbonded>);
+ if (nbGpuTaskMapping != gpuTaskAssignment.end())
{
- GMX_RELEASE_ASSERT(gpuTaskAssignment.size() == 1, "A valid GPU assignment can only have one task per rank");
- GMX_RELEASE_ASSERT(gpuTaskAssignment[0].task_ == gmx::GpuTask::Nonbonded, "A valid GPU assignment can only include short-ranged tasks");
- nonbondedDeviceId = gpuTaskAssignment[0].deviceId_;
+ int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
- }
- }
+ init_gpu(mdlog, nonbondedDeviceInfo);
- if (DOMAINDECOMP(cr))
- {
- /* When we share GPUs over ranks, we need to know this for the DLB */
- dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+ if (DOMAINDECOMP(cr))
+ {
+ /* When we share GPUs over ranks, we need to know this for the DLB */
+ dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+ }
+
+ }
}
/* getting number of PP/PME threads
@@ -1306,8 +1310,16 @@ int Mdrunner::mdrunner()
pmedata = nullptr;
}
- /* Free GPU memory and context */
- free_gpu_resources(fr, cr, nonbondedDeviceInfo);
+ // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
+ // before we destroy the GPU context(s) in free_gpu_resources().
+ // Pinned buffers are associated with contexts in CUDA.
+ // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
+ mdAtoms.reset(nullptr);
+ globalState.reset(nullptr);
+
+ /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
+ free_gpu_resources(fr, cr);
+ free_gpu(nonbondedDeviceInfo);
if (doMembed)
{
-- 2.22.0
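
A minimal sketch of the caller-managed GPU lifecycle that the runner follows after this patch, restating the runner.cpp hunks above in outline form for a PP rank with a nonbonded GPU task. The names (gpuTaskAssignment, hwinfo, mdlog, cr, fr) are those used in runner.cpp; error handling, domain decomposition and the PME path are omitted:

    // The runner, not the nonbonded or PME module, locates the assigned device ...
    auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
                                         hasTaskType<GpuTask::Nonbonded>);
    gmx_device_info_t *nonbondedDeviceInfo = nullptr;
    if (nbGpuTaskMapping != gpuTaskAssignment.end())
    {
        // ... and initializes its context once per rank.
        nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nbGpuTaskMapping->deviceId_);
        init_gpu(mdlog, nonbondedDeviceInfo);
    }

    // Modules receive the device handle via init_forcerec()/gmx_pme_init() and
    // no longer call init_gpu() themselves.

    // At shutdown, modules free their device buffers (and, with thread-MPI, all
    // ranks pass the physical-node barrier) before the runner destroys the context last.
    free_gpu_resources(fr, cr);
    free_gpu(nonbondedDeviceInfo);

Because free_gpu() only warns on failure, the runner calls this teardown unconditionally, so the same path is taken even on ranks that were not assigned a GPU.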