enable GPU sharing among tMPI ranks

author Szilard Pall <pall.szilard@gmail.com>

Fri, 4 Oct 2013 00:01:44 +0000 (02:01 +0200)

committer Berk Hess <hess@kth.se>

Wed, 6 Nov 2013 09:59:40 +0000 (10:59 +0100)
author Szilard Pall <pall.szilard@gmail.com>
Fri, 4 Oct 2013 00:01:44 +0000 (02:01 +0200)
committer Berk Hess <hess@kth.se>
Wed, 6 Nov 2013 09:59:40 +0000 (10:59 +0100)
diff --git a/include/force.h b/include/force.h

index 9dc45af3a3fd11acfa831fd4b5f7c0c597312f6a..cc695c6be457d33e387d87d6eee37b7f4a4340ca 100644 (file)
--- a/include/force.h
+++ b/include/force.h
@@ -160,14 +160,6 @@ void init_interaction_const_tables(FILE                *fp,
   * use with group kernels.
   */
  
-void init_interaction_const(FILE                 *fp,
-                            interaction_const_t **interaction_const,
-                            const t_forcerec     *fr,
-                            real                  rtab);
-/* Initializes the interaction constant data structure. Currently it
- * uses forcerec as input.
- */
-
  GMX_LIBMD_EXPORT
  void init_forcerec(FILE              *fplog,
                     const output_env_t oenv,
diff --git a/include/network.h b/include/network.h

index 1cc018fdfd5d20b3430330285cb97bb5b59ebeb1..6cfff06ac6c782f06c8ca9dc2186e80e9943a078 100644 (file)
--- a/include/network.h
+++ b/include/network.h
@@ -94,6 +94,7 @@ gmx_bool gmx_mpi_initialized(void);
   * when GROMACS was compiled without MPI support.
   */
  
+GMX_LIBGMX_EXPORT
  void gmx_barrier(const t_commrec *cr);
  /* Wait till all processes in cr->mpi_comm_mygroup have reached the barrier */
  
diff --git a/src/gmxlib/gmx_detect_hardware.c b/src/gmxlib/gmx_detect_hardware.c

index 2a3156eeaad73e0b45f850ce9fdc10c659a36ad6..e918a78c213e7a5b418a33f16d04637f2d088860 100644 (file)
--- a/src/gmxlib/gmx_detect_hardware.c
+++ b/src/gmxlib/gmx_detect_hardware.c
@@ -374,19 +374,11 @@ void gmx_check_hw_runconf_consistency(FILE *fplog,
  
              same_count = gmx_count_gpu_dev_shared(&hw_opt->gpu_opt);
  
-            if (btMPI && same_count > 0)
-            {
-                gmx_fatal(FARGS,
-                          "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
-                          "Use MPI if you are sure that you want to assign a GPU to multiple threads.");
-            }
-
              if (same_count > 0)
              {
                  md_print_warn(cr, fplog,
                                "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
-                              "      multiple %s%s; this should be avoided as it can cause\n"
-                              "      performance loss.\n",
+                              "      multiple %s%s; this can cause performance loss.\n",
                                same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
              }
          }
diff --git a/src/kernel/pme_loadbal.c b/src/kernel/pme_loadbal.c

index d1a2f8efee5e71e259e1c37c0bf41f1a7975ca9b..9700ceb147787087cfa759f15c501a4102bbf4fc 100644 (file)
--- a/src/kernel/pme_loadbal.c
+++ b/src/kernel/pme_loadbal.c
@@ -669,6 +669,25 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
          nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
      {
          nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
+
+        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+         * also sharing texture references. To keep the code simple, we don't
+         * treat texture references as shared resources, but this means that
+         * the coulomb_tab texture ref will get updated by multiple threads.
+         * Hence, to ensure that the non-bonded kernels don't start before all
+         * texture binding operations are finished, we need to wait for all ranks
+         * to arrive here before continuing.
+         *
+         * Note that we could omit this barrier if GPUs are not shared (or
+         * texture objects are used), but as this is initialization code, there
+         * is no point in complicating things.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif /* GMX_THREAD_MPI */
      }
      else
      {
diff --git a/src/kernel/runner.c b/src/kernel/runner.c

index 3665c4740619a783fbd7936e0ed89e4fbb156acc..42ef770eda2eb4d88ce4ce3f2c2a97fb661e98c2 100644 (file)
--- a/src/kernel/runner.c
+++ b/src/kernel/runner.c
@@ -960,6 +960,48 @@ static void override_nsteps_cmdline(FILE            *fplog,
      }
  }
  
+/* Frees the nbnxn data held in GPU memory and then uninitializes the GPU
+ * (by destroying the CUDA context).
+ *
+ * fplog  log file handle, passed on to nbnxn_cuda_free()
+ * fr     force record; fr->nbv tells whether the non-bondeds run on a GPU
+ * cr     communication record; supplies the rank duty, the node id used in
+ *        the warning message and the ranks that gmx_barrier() waits for
+ *
+ * This function can be called unconditionally: it only does work on PP
+ * ranks that use a GPU. On all other ranks (no DUTY_PP, or no GPU in use)
+ * it returns without freeing anything and without entering the barrier.
+ * NOTE(review): since only PP ranks using a GPU reach the barrier, this
+ * presumably relies on all ranks gmx_barrier() waits for taking the same
+ * branch - confirm against gmx_barrier() and the rank setup.
+ */
+static void free_gpu_resources(FILE             *fplog,
+                               const t_forcerec *fr,
+                               const t_commrec  *cr)
+{
+    gmx_bool bIsPPrankUsingGPU;
+    char     gpu_err_str[STRLEN];
+
+    /* fr is only dereferenced when this rank has PP duty (short-circuit &&) */
+    bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU;
+
+    if (bIsPPrankUsingGPU)
+    {
+        /* free nbnxn data in GPU memory */
+        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+
+        /* With tMPI we need to wait for all ranks to finish deallocation before
+         * destroying the context in free_gpu(), as some ranks may be sharing
+         * a GPU and its context.
+         * Note: only PP ranks need to free GPU resources, so it is safe to
+         * not call the barrier on PME ranks.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif /* GMX_THREAD_MPI */
+
+        /* uninitialize GPU (by destroying the context); a failure is only
+         * reported as a warning, it does not abort the run */
+        if (!free_gpu(gpu_err_str))
+        {
+            gmx_warning("On node %d failed to free GPU #%d: %s",
+                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+        }
+    }
+}
+
  int mdrunner(gmx_hw_opt_t *hw_opt,
               FILE *fplog, t_commrec *cr, int nfile,
               const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
@@ -1703,19 +1745,9 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                 nthreads_pp,
                 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
  
-    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
-    {
-        char gpu_err_str[STRLEN];
-
-        /* free GPU memory and uninitialize GPU (by destroying the context) */
-        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
  
-        if (!free_gpu(gpu_err_str))
-        {
-            gmx_warning("On node %d failed to free GPU #%d: %s",
-                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
-        }
-    }
+    /* Free GPU memory and context */
+    free_gpu_resources(fplog, fr, cr);
  
      if (opt2bSet("-membed", nfile, fnm))
      {
diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c

index f7da120aca08c5c7fc1f85ae4d63ec03fa2f4177..6a1f9db5868e6b9da4ba824599bf726a9685762e 100644 (file)
--- a/src/mdlib/forcerec.c
+++ b/src/mdlib/forcerec.c
@@ -1788,10 +1788,11 @@ void init_interaction_const_tables(FILE                *fp,
      }
  }
  
-void init_interaction_const(FILE                 *fp,
-                            interaction_const_t **interaction_const,
-                            const t_forcerec     *fr,
-                            real                  rtab)
+static void init_interaction_const(FILE                 *fp,
+                                   const t_commrec      *cr,
+                                   interaction_const_t **interaction_const,
+                                   const t_forcerec     *fr,
+                                   real                  rtab)
  {
      interaction_const_t *ic;
      gmx_bool             bUsesSimpleTables = TRUE;
@@ -1876,6 +1877,25 @@ void init_interaction_const(FILE                 *fp,
      if (fr->nbv != NULL && fr->nbv->bUseGPU)
      {
          nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+
+        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+         * also sharing texture references. To keep the code simple, we don't
+         * treat texture references as shared resources, but this means that
+         * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+         * Hence, to ensure that the non-bonded kernels don't start before all
+         * texture binding operations are finished, we need to wait for all ranks
+         * to arrive here before continuing.
+         *
+         * Note that we could omit this barrier if GPUs are not shared (or
+         * texture objects are used), but as this is initialization code, there
+         * is no point in complicating things.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif /* GMX_THREAD_MPI */
      }
  
      bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
@@ -2905,7 +2925,8 @@ void init_forcerec(FILE              *fp,
      }
  
      /* fr->ic is used both by verlet and group kernels (to some extent) now */
-    init_interaction_const(fp, &fr->ic, fr, rtab);
+    init_interaction_const(fp, cr, &fr->ic, fr, rtab);
+
      if (ir->eDispCorr != edispcNO)
      {
          calc_enervirdiff(fp, ir->eDispCorr, fr);
author	Szilard Pall <pall.szilard@gmail.com>
	Fri, 4 Oct 2013 00:01:44 +0000 (02:01 +0200)
committer	Berk Hess <hess@kth.se>
	Wed, 6 Nov 2013 09:59:40 +0000 (10:59 +0100)
include/force.h		patch \| blob \| history
include/network.h		patch \| blob \| history
src/gmxlib/gmx_detect_hardware.c		patch \| blob \| history
src/kernel/pme_loadbal.c		patch \| blob \| history
src/kernel/runner.c		patch \| blob \| history
src/mdlib/forcerec.c		patch \| blob \| history