Separate management of GPU contexts from modules

author Mark Abraham <mark.j.abraham@gmail.com>

Fri, 3 Nov 2017 01:33:01 +0000 (02:33 +0100)

committer Aleksei Iupinov <a.yupinov@gmail.com>

Fri, 24 Nov 2017 14:16:05 +0000 (15:16 +0100)
author Mark Abraham <mark.j.abraham@gmail.com>
Fri, 3 Nov 2017 01:33:01 +0000 (02:33 +0100)
committer Aleksei Iupinov <a.yupinov@gmail.com>
Fri, 24 Nov 2017 14:16:05 +0000 (15:16 +0100)
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp

index 6e1d9992bede96cabfd1da6548348564160a8ca9..9487fd0c5de7bd9a364b7b87ed50fb123480822b 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -282,11 +282,8 @@ static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error)
   *
   * \param[in,out] pme       The PME structure.
   * \param[in,out] gpuInfo   The GPU information structure.
- * \param[in]     mdlog     The logger.
- * \param[in]     cr        The communication structure.
   */
-static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog,
-                         const t_commrec *cr)
+static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
  {
      std::string errorString;
      bool        canRunOnGpu = pme_gpu_check_restrictions(pme, &errorString);
@@ -308,8 +305,6 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::
  
      pme_gpu_set_testing(pmeGPU, false);
  
-    // GPU initialization
-    init_gpu(mdlog, cr->nodeid, gpuInfo);
      pmeGPU->deviceInfo = gpuInfo;
  
      pme_gpu_init_internal(pmeGPU);
@@ -395,7 +390,7 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx:
      }
  }
  
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr)
+void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
  {
      if (!pme_gpu_active(pme))
      {
@@ -405,7 +400,7 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLog
      if (!pme->gpu)
      {
          /* First-time initialization */
-        pme_gpu_init(pme, gpuInfo, mdlog, cr);
+        pme_gpu_init(pme, gpuInfo);
      }
      else
      {
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h

index be50ead6afc23a28f83b146608752f9a691cf98b..33c83103a852b2f22f1f4c4a632074c5c5dabeb5 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -55,7 +55,6 @@
  struct gmx_hw_info_t;
  struct gmx_gpu_opt_t;
  struct gmx_pme_t;                              // only used in pme_gpu_reinit
-struct t_commrec;
  struct gmx_wallclock_gpu_pme_t;
  struct pme_atomcomm_t;
  struct t_complex;
@@ -636,11 +635,9 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx:
   *
   * \param[in,out] pme       The PME structure.
   * \param[in,out] gpuInfo   The GPU information structure.
- * \param[in]     mdlog     The logger.
- * \param[in]     cr        The communication structure.
   * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
   */
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr);
+void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo);
  
  /*! \libinternal \brief
   * Destroys the PME GPU data at the end of the run.
diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp

index 344d8553cc5d162b2a93a1d93e34d5101dc5bc94..c90254b0a34c7e16d0f66a692706eb0b7434cccc 100644 (file)
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -520,7 +520,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
                          PmeRunMode           runMode,
                          PmeGpu              *pmeGPU,
                          gmx_device_info_t   *gpuInfo,
-                        const gmx::MDLogger &mdlog)
+                        const gmx::MDLogger  & /*mdlog*/)
  {
      int               use_threads, sum_use_threads, i;
      ivec              ndata;
@@ -853,7 +853,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec     *cr,
      pme->lb_buf2       = nullptr;
      pme->lb_buf_nalloc = 0;
  
-    pme_gpu_reinit(pme.get(), gpuInfo, mdlog, cr);
+    pme_gpu_reinit(pme.get(), gpuInfo);
  
      pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
  
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp

index f7c7b0f141f48f9b660da98ac8e99321085d5d70..f9ea663ce6f292348b84857e0bfcb009c90bb3a6 100644 (file)
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -52,6 +52,7 @@
  #include "gromacs/ewald/pme-solve.h"
  #include "gromacs/ewald/pme-spread.h"
  #include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/math/invertmatrix.h"
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/pbcutil/pbc.h"
@@ -103,6 +104,10 @@ static PmeSafePointer pmeInitInternal(const t_inputrec         *inputRec,
                                        )
  {
      const MDLogger dummyLogger;
+    if (gpuInfo)
+    {
+        init_gpu(dummyLogger, gpuInfo);
+    }
      const auto     runMode       = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::GPU;
      t_commrec      dummyCommrec  = {0};
      gmx_pme_t     *pmeDataRaw    = gmx_pme_init(&dummyCommrec, 1, 1, inputRec, atomCount, false, false, true,
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu

index fb509f5d37e16b964b275d4a828796948d6274b7..4e2200794089385cc0310117b7e24ca7064de543 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -55,11 +55,13 @@
  #include "gromacs/hardware/gpu_hw_info.h"
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/logger.h"
  #include "gromacs/utility/programcontext.h"
  #include "gromacs/utility/smalloc.h"
  #include "gromacs/utility/snprintf.h"
+#include "gromacs/utility/stringutil.h"
  
  #if HAVE_NVML
  #include <nvml.h>
@@ -507,20 +509,18 @@ static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused
  #endif /* HAVE_NVML_APPLICATION_CLOCKS */
  }
  
-void init_gpu(const gmx::MDLogger &mdlog, int rank,
-              gmx_device_info_t *deviceInfo)
+void init_gpu(const gmx::MDLogger &mdlog,
+              gmx_device_info_t   *deviceInfo)
  {
      cudaError_t stat;
-    char        sbuf[STRLEN];
  
      assert(deviceInfo);
  
      stat = cudaSetDevice(deviceInfo->id);
      if (stat != cudaSuccess)
      {
-        snprintf(sbuf, STRLEN, "On rank %d failed to initialize GPU #%d",
-                 rank, deviceInfo->id);
-        CU_RET_ERR(stat, sbuf);
+        auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
+        CU_RET_ERR(stat, message.c_str());
      }
  
      if (debug)
@@ -534,13 +534,9 @@ void init_gpu(const gmx::MDLogger &mdlog, int rank,
      init_gpu_application_clocks(mdlog, deviceInfo);
  }
  
-gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo,
-                       char                    *result_str)
+void free_gpu(const gmx_device_info_t *deviceInfo)
  {
      cudaError_t  stat;
-    gmx_bool     reset_gpu_application_clocks_status = true;
-
-    assert(result_str);
  
      if (debug)
      {
@@ -552,12 +548,17 @@ gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo,
  
      if (deviceInfo != nullptr)
      {
-        reset_gpu_application_clocks_status = reset_gpu_application_clocks(deviceInfo);
+        if (!reset_gpu_application_clocks(deviceInfo))
+        {
+            gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
+        }
      }
  
      stat = cudaDeviceReset();
-    strncpy(result_str, cudaGetErrorString(stat), STRLEN);
-    return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
+    if (stat != cudaSuccess)
+    {
+        gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
+    }
  }
  
  gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h

index 8f596304629b040b187576636d7bf997cdaf5a78..b1cc07b51f9b656dcd9d46f6575537638ae33d93 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -114,7 +114,6 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU
   * the patterns here are the same as elsewhere in this header.
   *
   *  param[in]    mdlog        log file to write to
- *  param[in]    rank         MPI rank of this process (for error output)
   * \param[inout] deviceInfo   device info of the GPU to initialize
   *
   * Issues a fatal error for any critical errors that occur during
@@ -122,7 +121,6 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU
   */
  GPU_FUNC_QUALIFIER
  void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
-              int GPU_FUNC_ARGUMENT(rank),
                gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
  
  /*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
@@ -130,15 +128,14 @@ void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
   * The context is explicitly destroyed and therefore all data uploaded to the GPU
   * is lost. This should only be called when none of this data is required anymore.
   *
+ * Calls gmx_warning upon errors.
+ *
   * \param[in]  deviceInfo   device info of the GPU to clean up for
- * \param[out] result_str   the message related to the error that occurred
- *                          during the initialization (if there was any).
   *
   * \returns                 true if no error occurs during the freeing.
   */
  CUDA_FUNC_QUALIFIER
-gmx_bool free_cuda_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo),
-                       char *CUDA_FUNC_ARGUMENT(result_str)) CUDA_FUNC_TERM_WITH_RETURN(TRUE)
+void free_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo)) CUDA_FUNC_TERM
  
  /*! \brief Return a pointer to the device info for \c deviceId
   *
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp

index 5195baccc5d97ffcede987b83d7eb21aa52d1bf4..c121f813b3904ed3870f1307c4acbd9682e92fc3 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -401,7 +401,6 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind
  
  //! This function is documented in the header file
  void init_gpu(const gmx::MDLogger               & /*mdlog*/,
-              int                               /* rank */,
                gmx_device_info_t                *deviceInfo)
  {
      assert(deviceInfo);
diff --git a/src/gromacs/mdlib/force.h b/src/gromacs/mdlib/force.h

index 87d4fe94961a048b1ad5bbd594fbac7285cf654b..2d83e17a252b8e66234483d3d2f65fa0732f01ed 100644 (file)
--- a/src/gromacs/mdlib/force.h
+++ b/src/gromacs/mdlib/force.h
@@ -216,7 +216,6 @@ void do_force_lowlevel(t_forcerec   *fr,
  /* Call all the force routines */
  
  void free_gpu_resources(const t_forcerec            *fr,
-                        const t_commrec             *cr,
-                        const gmx_device_info_t     *deviceInfo);
+                        const t_commrec             *cr);
  
  #endif
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index 05c59c2f2937752d83b48a44dfdab5bb5191519e..0d9cfb49c7412291e6695655c50b1d10d95697fe 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -2134,19 +2134,15 @@ init_interaction_const(FILE                       *fp,
      *interaction_const = ic;
  }
  
-/* TODO deviceInfo should be logically const, but currently
- * init_gpu modifies it to set up NVML support. This could
- * happen during the detection phase, and deviceInfo could
- * the become const. */
-static void init_nb_verlet(const gmx::MDLogger &mdlog,
-                           nonbonded_verlet_t **nb_verlet,
-                           gmx_bool             bFEP_NonBonded,
-                           const t_inputrec    *ir,
-                           const t_forcerec    *fr,
-                           const t_commrec     *cr,
-                           gmx_device_info_t   *deviceInfo,
-                           const gmx_mtop_t    *mtop,
-                           matrix               box)
+static void init_nb_verlet(const gmx::MDLogger     &mdlog,
+                           nonbonded_verlet_t     **nb_verlet,
+                           gmx_bool                 bFEP_NonBonded,
+                           const t_inputrec        *ir,
+                           const t_forcerec        *fr,
+                           const t_commrec         *cr,
+                           const gmx_device_info_t *deviceInfo,
+                           const gmx_mtop_t        *mtop,
+                           matrix                   box)
  {
      nonbonded_verlet_t *nbv;
      char               *env;
@@ -2161,12 +2157,6 @@ static void init_nb_verlet(const gmx::MDLogger &mdlog,
  
      GMX_RELEASE_ASSERT(!(nbv->emulateGpu == EmulateGpuNonbonded::Yes && nbv->bUseGPU), "When GPU emulation is active, there cannot be a GPU assignment");
  
-    if (nbv->bUseGPU)
-    {
-        /* Use the assigned GPU. */
-        init_gpu(mdlog, cr->nodeid, deviceInfo);
-    }
-
      nbv->nbs             = nullptr;
      nbv->min_ci_balanced = 0;
  
@@ -2319,20 +2309,20 @@ gmx_bool usingGpu(nonbonded_verlet_t *nbv)
      return nbv != nullptr && nbv->bUseGPU;
  }
  
-void init_forcerec(FILE                *fp,
-                   const gmx::MDLogger &mdlog,
-                   t_forcerec          *fr,
-                   t_fcdata            *fcd,
-                   const t_inputrec    *ir,
-                   const gmx_mtop_t    *mtop,
-                   const t_commrec     *cr,
-                   matrix               box,
-                   const char          *tabfn,
-                   const char          *tabpfn,
-                   const t_filenm      *tabbfnm,
-                   gmx_device_info_t   *deviceInfo,
-                   gmx_bool             bNoSolvOpt,
-                   real                 print_force)
+void init_forcerec(FILE                    *fp,
+                   const gmx::MDLogger     &mdlog,
+                   t_forcerec              *fr,
+                   t_fcdata                *fcd,
+                   const t_inputrec        *ir,
+                   const gmx_mtop_t        *mtop,
+                   const t_commrec         *cr,
+                   matrix                   box,
+                   const char              *tabfn,
+                   const char              *tabpfn,
+                   const t_filenm          *tabbfnm,
+                   const gmx_device_info_t *deviceInfo,
+                   gmx_bool                 bNoSolvOpt,
+                   real                     print_force)
  {
      int            i, m, negp_pp, negptable, egi, egj;
      real           rtab;
@@ -3156,27 +3146,26 @@ void init_forcerec(FILE                *fp,
      }
  }
  
-/* Frees GPU memory and destroys the GPU context.
+/* Frees GPU memory and sets a tMPI node barrier.
   *
   * Note that this function needs to be called even if GPUs are not used
   * in this run because the PME ranks have no knowledge of whether GPUs
   * are used or not, but all ranks need to enter the barrier below.
+ * \todo Remove physical node barrier from this function after making sure
+ * that it's not needed anymore (with a shared GPU run).
   */
  void free_gpu_resources(const t_forcerec        *fr,
-                        const t_commrec         *cr,
-                        const gmx_device_info_t *deviceInfo)
+                        const t_commrec         *cr)
  {
-    gmx_bool bIsPPrankUsingGPU;
-    char     gpu_err_str[STRLEN];
+    bool isPPrankUsingGPU = fr && fr->nbv && fr->nbv->bUseGPU;
  
-    bIsPPrankUsingGPU = thisRankHasDuty(cr, DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU;
+    /* stop the GPU profiler (only CUDA) */
+    stopGpuProfiler();
  
-    if (bIsPPrankUsingGPU)
+    if (isPPrankUsingGPU)
      {
          /* free nbnxn data in GPU memory */
          nbnxn_gpu_free(fr->nbv->gpu_nbv);
-        /* stop the GPU profiler (only CUDA) */
-        stopGpuProfiler();
      }
  
      /* With tMPI we need to wait for all ranks to finish deallocation before
@@ -3189,20 +3178,8 @@ void free_gpu_resources(const t_forcerec        *fr,
       * Note: it is safe to not call the barrier on the ranks which do not use GPU,
       * but it is easier and more futureproof to call it on the whole node.
       */
-#if GMX_THREAD_MPI
-    if (PAR(cr) || MULTISIM(cr))
+    if (GMX_THREAD_MPI && (PAR(cr) || MULTISIM(cr)))
      {
          gmx_barrier_physical_node(cr);
      }
-#endif  /* GMX_THREAD_MPI */
-
-    if (bIsPPrankUsingGPU)
-    {
-        /* uninitialize GPU (by destroying the context) */
-        if (!free_cuda_gpu(deviceInfo, gpu_err_str))
-        {
-            gmx_warning("On rank %d failed to free GPU #%d: %s",
-                        cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
-        }
-    }
  }
diff --git a/src/gromacs/mdlib/forcerec.h b/src/gromacs/mdlib/forcerec.h

index 5dd6a73fca4cfed23357d5271bdd3ddceea521a3..9906a755a118d0eaf068a11dcff4a947004dfed6 100644 (file)
--- a/src/gromacs/mdlib/forcerec.h
+++ b/src/gromacs/mdlib/forcerec.h
@@ -111,20 +111,20 @@ void init_interaction_const_tables(FILE                   *fp,
   * \param[in]  bNoSolvOpt  Do not use solvent optimization
   * \param[in]  print_force Print forces for atoms with force >= print_force
   */
-void init_forcerec(FILE                   *fplog,
-                   const gmx::MDLogger    &mdlog,
-                   t_forcerec             *fr,
-                   t_fcdata               *fcd,
-                   const t_inputrec       *ir,
-                   const gmx_mtop_t       *mtop,
-                   const t_commrec        *cr,
-                   matrix                  box,
-                   const char             *tabfn,
-                   const char             *tabpfn,
-                   const t_filenm         *tabbfnm,
-                   gmx_device_info_t      *deviceInfo,
-                   gmx_bool                bNoSolvOpt,
-                   real                    print_force);
+void init_forcerec(FILE                    *fplog,
+                   const gmx::MDLogger     &mdlog,
+                   t_forcerec              *fr,
+                   t_fcdata                *fcd,
+                   const t_inputrec        *ir,
+                   const gmx_mtop_t        *mtop,
+                   const t_commrec         *cr,
+                   matrix                   box,
+                   const char              *tabfn,
+                   const char              *tabpfn,
+                   const t_filenm          *tabbfnm,
+                   const gmx_device_info_t *deviceInfo,
+                   gmx_bool                 bNoSolvOpt,
+                   real                     print_force);
  
  /*! \brief Divide exclusions over threads
   *
diff --git a/src/gromacs/taskassignment/taskassignment.h b/src/gromacs/taskassignment/taskassignment.h

index 294230543ebb3ff69430fd71983ba6cfb5139ecc..62123e082405d0935ecf228398de2552584a02c2 100644 (file)
--- a/src/gromacs/taskassignment/taskassignment.h
+++ b/src/gromacs/taskassignment/taskassignment.h
@@ -118,6 +118,13 @@ runTaskAssignment(const std::vector<int>     &gpuIdsToUse,
                    const t_commrec            *cr,
                    const std::vector<GpuTask> &gpuTasksOnThisRank);
  
+//! Function for whether the task of \c mapping has value \c TaskType.
+template<GpuTask TaskType>
+bool hasTaskType(const GpuTaskMapping &mapping)
+{
+    return mapping.task_ == TaskType;
+}
+
  } // namespace
  
  #endif
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp

index 53c55ff09fe9d25ef69b513ab9e31902911f42d8..b266ecb41547f674dba7523d96b1dc15fe5255b3 100644 (file)
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -84,6 +84,7 @@
  #include "gromacs/mdlib/mdrun.h"
  #include "gromacs/mdlib/minimize.h"
  #include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  #include "gromacs/mdlib/nbnxn_search.h"
  #include "gromacs/mdlib/nbnxn_tuning.h"
  #include "gromacs/mdlib/qmmm.h"
@@ -1009,22 +1010,25 @@ int Mdrunner::mdrunner()
                                         cr, mdlog);
  
      gmx_device_info_t *nonbondedDeviceInfo = nullptr;
-    int                nonbondedDeviceId   = -1;
+
      if (thisRankHasDuty(cr, DUTY_PP))
      {
-        if (!gpuTaskAssignment.empty())
+        // This works because only one task of each type is currently permitted.
+        auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+                                             hasTaskType<GpuTask::Nonbonded>);
+        if (nbGpuTaskMapping != gpuTaskAssignment.end())
          {
-            GMX_RELEASE_ASSERT(gpuTaskAssignment.size() == 1, "A valid GPU assignment can only have one task per rank");
-            GMX_RELEASE_ASSERT(gpuTaskAssignment[0].task_ == gmx::GpuTask::Nonbonded, "A valid GPU assignment can only include short-ranged tasks");
-            nonbondedDeviceId   = gpuTaskAssignment[0].deviceId_;
+            int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
              nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
-        }
-    }
+            init_gpu(mdlog, nonbondedDeviceInfo);
  
-    if (DOMAINDECOMP(cr))
-    {
-        /* When we share GPUs over ranks, we need to know this for the DLB */
-        dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+            if (DOMAINDECOMP(cr))
+            {
+                /* When we share GPUs over ranks, we need to know this for the DLB */
+                dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+            }
+
+        }
      }
  
      /* getting number of PP/PME threads
@@ -1306,8 +1310,16 @@ int Mdrunner::mdrunner()
          pmedata = nullptr;
      }
  
-    /* Free GPU memory and context */
-    free_gpu_resources(fr, cr, nonbondedDeviceInfo);
+    // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
+    // before we destroy the GPU context(s) in free_gpu_resources().
+    // Pinned buffers are associated with contexts in CUDA.
+    // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
+    mdAtoms.reset(nullptr);
+    globalState.reset(nullptr);
+
+    /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
+    free_gpu_resources(fr, cr);
+    free_gpu(nonbondedDeviceInfo);
  
      if (doMembed)
      {
author	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 3 Nov 2017 01:33:01 +0000 (02:33 +0100)
committer	Aleksei Iupinov <a.yupinov@gmail.com>
	Fri, 24 Nov 2017 14:16:05 +0000 (15:16 +0100)
src/gromacs/ewald/pme-gpu-internal.cpp		patch \| blob \| history
src/gromacs/ewald/pme-gpu-internal.h		patch \| blob \| history
src/gromacs/ewald/pme.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmetestcommon.cpp		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.cu		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.h		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils_ocl.cpp		patch \| blob \| history
src/gromacs/mdlib/force.h		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/forcerec.h		patch \| blob \| history
src/gromacs/taskassignment/taskassignment.h		patch \| blob \| history
src/programs/mdrun/runner.cpp		patch \| blob \| history