From: Szilard Pall
Date: Fri, 4 Oct 2013 00:01:44 +0000 (+0200)
Subject: enable GPU sharing among tMPI ranks
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=726a885cae86bd7598f9abdbccce02948435ed2f;p=alexxy%2Fgromacs.git

enable GPU sharing among tMPI ranks

It turns out that the only issue preventing GPU sharing among
thread-MPI threads was that when the first thread to arrive at
free_gpu() destroys the context, the other thread(s) sharing a GPU
with it are most likely still freeing their resources, an operation
which fails as soon as the "fast" thread has destroyed the context.
Simply placing a barrier between the GPU resource freeing and the
context destruction solves the issue.

However, there is still a very unlikely concurrency hazard after CUDA
texture reference updates (non-bonded parameter table and coulomb
force table initialization). To be on the safe side, with tMPI a
barrier is placed after these operations.

Change-Id: Iac7a39f841ca31a32ab979ee0012cfc18a811d76
---

diff --git a/include/force.h b/include/force.h
index 9dc45af3a3..cc695c6be4 100644
--- a/include/force.h
+++ b/include/force.h
@@ -160,14 +160,6 @@ void init_interaction_const_tables(FILE *fp,
  * use with group kernels.
  */
 
-void init_interaction_const(FILE *fp,
-                            interaction_const_t **interaction_const,
-                            const t_forcerec *fr,
-                            real rtab);
-/* Initializes the interaction constant data structure. Currently it
- * uses forcerec as input.
- */
-
 GMX_LIBMD_EXPORT
 void init_forcerec(FILE *fplog,
                    const output_env_t oenv,
diff --git a/include/network.h b/include/network.h
index 1cc018fdfd..6cfff06ac6 100644
--- a/include/network.h
+++ b/include/network.h
@@ -94,6 +94,7 @@ gmx_bool gmx_mpi_initialized(void);
  * when GROMACS was compiled without MPI support.
  */
 
+GMX_LIBGMX_EXPORT
 void gmx_barrier(const t_commrec *cr);
 /* Wait till all processes in cr->mpi_comm_mygroup have reached the barrier */
diff --git a/src/gmxlib/gmx_detect_hardware.c b/src/gmxlib/gmx_detect_hardware.c
index 2a3156eeaa..e918a78c21 100644
--- a/src/gmxlib/gmx_detect_hardware.c
+++ b/src/gmxlib/gmx_detect_hardware.c
@@ -374,19 +374,11 @@ void gmx_check_hw_runconf_consistency(FILE *fplog,
 
         same_count = gmx_count_gpu_dev_shared(&hw_opt->gpu_opt);
 
-        if (btMPI && same_count > 0)
-        {
-            gmx_fatal(FARGS,
-                      "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
-                      "Use MPI if you are sure that you want to assign a GPU to multiple threads.");
-        }
-
         if (same_count > 0)
         {
             md_print_warn(cr, fplog,
                           "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
-                          "      multiple %s%s; this should be avoided as it can cause\n"
-                          "      performance loss.\n",
+                          "      multiple %s%s; this can cause performance loss.\n",
                           same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
         }
     }
diff --git a/src/kernel/pme_loadbal.c b/src/kernel/pme_loadbal.c
index d1a2f8efee..9700ceb147 100644
--- a/src/kernel/pme_loadbal.c
+++ b/src/kernel/pme_loadbal.c
@@ -669,6 +669,25 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
         nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
     {
         nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
+
+        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+         * also sharing texture references. To keep the code simple, we don't
+         * treat texture references as shared resources, but this means that
+         * the coulomb_tab texture ref will get updated by multiple threads.
+         * Hence, to ensure that the non-bonded kernels don't start before all
+         * texture binding operations are finished, we need to wait for all ranks
+         * to arrive here before continuing.
+         *
+         * Note that we could omit this barrier if GPUs are not shared (or
+         * texture objects are used), but as this is initialization code, there
+         * is no point in complicating things.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif /* GMX_THREAD_MPI */
     }
     else
     {
diff --git a/src/kernel/runner.c b/src/kernel/runner.c
index 3665c47406..42ef770eda 100644
--- a/src/kernel/runner.c
+++ b/src/kernel/runner.c
@@ -960,6 +960,48 @@ static void override_nsteps_cmdline(FILE *fplog,
     }
 }
 
+/* Frees GPU memory and destroys the CUDA context.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ */
+static void free_gpu_resources(FILE *fplog,
+                               const t_forcerec *fr,
+                               const t_commrec *cr)
+{
+    gmx_bool bIsPPrankUsingGPU;
+    char     gpu_err_str[STRLEN];
+
+    bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU;
+
+    if (bIsPPrankUsingGPU)
+    {
+        /* free nbnxn data in GPU memory */
+        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+
+        /* With tMPI we need to wait for all ranks to finish deallocation before
+         * destroying the context in free_gpu(), as some ranks may be sharing
+         * a GPU and context.
+         * Note: only PP ranks need to free GPU resources, so it is safe not
+         * to call the barrier on PME ranks.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif /* GMX_THREAD_MPI */
+
+        /* uninitialize GPU (by destroying the context) */
+        if (!free_gpu(gpu_err_str))
+        {
+            gmx_warning("On node %d failed to free GPU #%d: %s",
+                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+        }
+    }
+}
+
 int mdrunner(gmx_hw_opt_t *hw_opt,
              FILE *fplog, t_commrec *cr, int nfile,
              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
@@ -1703,19 +1745,9 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                 nthreads_pp,
                 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 
-    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
-    {
-        char gpu_err_str[STRLEN];
-
-        /* free GPU memory and uninitialize GPU (by destroying the context) */
-        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
-        if (!free_gpu(gpu_err_str))
-        {
-            gmx_warning("On node %d failed to free GPU #%d: %s",
-                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
-        }
-    }
+    /* Free GPU memory and context */
+    free_gpu_resources(fplog, fr, cr);
 
     if (opt2bSet("-membed", nfile, fnm))
     {
diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c
index f7da120aca..6a1f9db586 100644
--- a/src/mdlib/forcerec.c
+++ b/src/mdlib/forcerec.c
@@ -1788,10 +1788,11 @@ void init_interaction_const_tables(FILE *fp,
     }
 }
 
-void init_interaction_const(FILE *fp,
-                            interaction_const_t **interaction_const,
-                            const t_forcerec *fr,
-                            real rtab)
+static void init_interaction_const(FILE *fp,
+                                   const t_commrec *cr,
+                                   interaction_const_t **interaction_const,
+                                   const t_forcerec *fr,
+                                   real rtab)
 {
     interaction_const_t *ic;
     gmx_bool bUsesSimpleTables = TRUE;
@@ -1876,6 +1877,25 @@ void init_interaction_const(FILE *fp,
     if (fr->nbv != NULL && fr->nbv->bUseGPU)
     {
         nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+
+        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+         * also sharing texture references. To keep the code simple, we don't
+         * treat texture references as shared resources, but this means that
+         * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+         * Hence, to ensure that the non-bonded kernels don't start before all
+         * texture binding operations are finished, we need to wait for all ranks
+         * to arrive here before continuing.
+         *
+         * Note that we could omit this barrier if GPUs are not shared (or
+         * texture objects are used), but as this is initialization code, there
+         * is no point in complicating things.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif /* GMX_THREAD_MPI */
     }
 
     bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
@@ -2905,7 +2925,8 @@ void init_forcerec(FILE *fp,
     }
 
     /* fr->ic is used both by verlet and group kernels (to some extent) now */
-    init_interaction_const(fp, &fr->ic, fr, rtab);
+    init_interaction_const(fp, cr, &fr->ic, fr, rtab);
+
     if (ir->eDispCorr != edispcNO)
    {
         calc_enervirdiff(fp, ir->eDispCorr, fr);
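
For illustration only, a minimal standalone sketch of the teardown ordering
that the free_gpu_resources() added above enforces: free per-rank device
allocations, synchronize all ranks sharing the device, and only then tear
down the context. It uses plain MPI and the CUDA runtime API in place of
GROMACS' thread-MPI gmx_barrier() wrapper, and d_buf is a hypothetical
per-rank device buffer; this sketch is not part of the patch.

#include <mpi.h>
#include <cuda_runtime.h>

/* Every rank first frees its own device allocations, then all ranks sharing
 * the device synchronize, and only then is the context torn down. */
static void free_gpu_resources_sketch(float *d_buf, MPI_Comm comm)
{
    /* 1. free this rank's GPU allocations */
    cudaFree(d_buf);

    /* 2. wait until every rank sharing the GPU has finished freeing */
    MPI_Barrier(comm);

    /* 3. only now is it safe to destroy the (shared) context */
    cudaDeviceReset();
}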