* use with group kernels.
*/
-void init_interaction_const(FILE *fp,
- interaction_const_t **interaction_const,
- const t_forcerec *fr,
- real rtab);
-/* Initializes the interaction constant data structure. Currently it
- * uses forcerec as input.
- */
-
GMX_LIBMD_EXPORT
void init_forcerec(FILE *fplog,
const output_env_t oenv,
* when GROMACS was compiled without MPI support.
*/
+GMX_LIBGMX_EXPORT
void gmx_barrier(const t_commrec *cr);
/* Wait till all processes in cr->mpi_comm_mygroup have reached the barrier */
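
The GMX_LIBGMX_EXPORT is presumably needed because the new gmx_barrier() call
sites in this patch live outside the library that defines it. Both call sites
use the same guarded idiom, so the barrier compiles away in non-thread-MPI
builds and is skipped in serial runs; a minimal sketch (cr is the caller's
communication record, and PAR(cr) is true when the run has more than one rank):

    #ifdef GMX_THREAD_MPI
        if (PAR(cr))
        {
            gmx_barrier(cr); /* sync all ranks in cr->mpi_comm_mygroup */
        }
    #endif
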
same_count = gmx_count_gpu_dev_shared(&hw_opt->gpu_opt);
- if (btMPI && same_count > 0)
- {
- gmx_fatal(FARGS,
- "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
- "Use MPI if you are sure that you want to assign a GPU to multiple threads.");
- }
-
if (same_count > 0)
{
md_print_warn(cr, fplog,
"NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
- " multiple %s%s; this should be avoided as it can cause\n"
- " performance loss.\n",
+ " multiple %s%s; this can cause performance loss.\n",
same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
}
}
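
With the fatal error above removed, sharing a GPU among thread-MPI ranks is now
merely flagged by this note. As a concrete example of the reworded message:
assuming th_or_proc is "thread-MPI thread" here (which the btMPI ? "s" : "es"
pluralization suggests), two thread-MPI ranks sharing a single GPU would print:

    NOTE: Potentially sub-optimal launch configuration: you assigned a GPU to
          multiple thread-MPI threads; this can cause performance loss.
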
nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
{
nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
+
+ /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ * also sharing texture references. To keep the code simple, we don't
+ * treat texture references as shared resources, but this means that
+ * the coulomb_tab texture ref will get updated by multiple threads.
+ * Hence, to ensure that the non-bonded kernels don't start before all
+ * texture binding operations are finished, we need to wait for all ranks
+ * to arrive here before continuing.
+ *
+ * Note that we could omit this barrier if GPUs are not shared (or
+ * texture objects are used; see the sketches below), but as this is
+ * initialization code, there is no point in complicating things.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
}
else
{
}
}
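
The race the comment above describes comes from texture references being
module-scope globals in the CUDA runtime API: every thread-MPI rank driving the
same context binds the very same object. A minimal sketch of the situation; the
names coulomb_tab_texref and bind_coulomb_tab are illustrative, not the actual
GROMACS symbols:

    #include <cuda_runtime.h>

    /* Module-scope texture reference: one global per CUDA module, shared
     * by all thread-MPI ranks that drive the same context. */
    texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;

    /* Every rank runs this during init; with a shared GPU, two ranks can
     * execute it concurrently and race on the single shared reference,
     * which is exactly what the barrier above guards against. */
    static void bind_coulomb_tab(const float *d_tab, size_t bytes)
    {
        cudaBindTexture(NULL, coulomb_tab_texref, d_tab, bytes);
    }
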
+/* Frees GPU memory and destroys the CUDA context.
+ *
+ * Note that this function is safe to call unconditionally: on ranks that
+ * did not use a GPU, including the PME ranks, it returns without touching
+ * the device.
+ */
+static void free_gpu_resources(FILE *fplog,
+ const t_forcerec *fr,
+ const t_commrec *cr)
+{
+ gmx_bool bIsPPrankUsingGPU;
+ char gpu_err_str[STRLEN];
+
+ bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU;
+
+ if (bIsPPrankUsingGPU)
+ {
+ /* free nbnxn data in GPU memory */
+ nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+
+ /* With tMPI we need to wait for all ranks to finish deallocation before
+ * destroying the context in free_gpu(), as some ranks may be sharing a
+ * GPU and context.
+ * Note: as only PP ranks need to free GPU resources, it is safe not to
+ * call the barrier on PME ranks.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
+
+ /* uninitialize GPU (by destroying the context) */
+ if (!free_gpu(gpu_err_str))
+ {
+ gmx_warning("On node %d failed to free GPU #%d: %s",
+ cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ }
+ }
+}
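
To make the ordering constraint in free_gpu_resources() concrete, here is a
hypothetical interleaving of two thread-MPI ranks sharing one GPU (and
therefore one CUDA context) if the barrier were omitted:

    rank 0                                rank 1
    ------                                ------
    nbnxn_cuda_free(fplog, ...)           still finishing its own cleanup
    free_gpu()  -> context destroyed
                                          nbnxn_cuda_free(fplog, ...)
                                            -> CUDA calls fail: the shared
                                               context no longer exists

The barrier forces both ranks to finish deallocation before either reaches
free_gpu(); PME ranks never enter this branch, which is why they can safely
skip the barrier.
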
+
int mdrunner(gmx_hw_opt_t *hw_opt,
FILE *fplog, t_commrec *cr, int nfile,
const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
nthreads_pp,
EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
- if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
- {
- char gpu_err_str[STRLEN];
-
- /* free GPU memory and uninitialize GPU (by destroying the context) */
- nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
- if (!free_gpu(gpu_err_str))
- {
- gmx_warning("On node %d failed to free GPU #%d: %s",
- cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
- }
- }
+ /* Free GPU memory and context */
+ free_gpu_resources(fplog, fr, cr);
if (opt2bSet("-membed", nfile, fnm))
{
}
}
-void init_interaction_const(FILE *fp,
- interaction_const_t **interaction_const,
- const t_forcerec *fr,
- real rtab)
+static void init_interaction_const(FILE *fp,
+ const t_commrec *cr,
+ interaction_const_t **interaction_const,
+ const t_forcerec *fr,
+ real rtab)
{
interaction_const_t *ic;
gmx_bool bUsesSimpleTables = TRUE;
if (fr->nbv != NULL && fr->nbv->bUseGPU)
{
nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+
+ /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ * also sharing texture references. To keep the code simple, we don't
+ * treat texture references as shared resources, but this means that
+ * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+ * Hence, to ensure that the non-bonded kernels don't start before all
+ * texture binding operations are finished, we need to wait for all ranks
+ * to arrive here before continuing.
+ *
+ * Note that we could omit this barrier if GPUs are not shared (or
+ * texture objects are used; see the sketch below), but as this is
+ * initialization code, there is no point in complicating things.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
}
bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
}
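
As for the texture-object alternative both comments mention: a
cudaTextureObject_t is a per-rank value created from ordinary device memory
(CUDA >= 5.0, compute capability >= 3.0), so nothing module-scoped is shared
and no barrier would be needed. A sketch under assumed names
(make_coulomb_tab_texobj and d_tab are illustrative, not GROMACS API):

    #include <cuda_runtime.h>
    #include <string.h>

    static cudaTextureObject_t make_coulomb_tab_texobj(const float *d_tab,
                                                       size_t       n)
    {
        cudaResourceDesc    rd;
        cudaTextureDesc     td;
        cudaTextureObject_t tex = 0;

        memset(&rd, 0, sizeof(rd));
        rd.resType                = cudaResourceTypeLinear;
        rd.res.linear.devPtr      = (void *)d_tab;
        rd.res.linear.desc        = cudaCreateChannelDesc(32, 0, 0, 0,
                                                          cudaChannelFormatKindFloat);
        rd.res.linear.sizeInBytes = n*sizeof(float);

        memset(&td, 0, sizeof(td));
        td.readMode = cudaReadModeElementType;

        cudaCreateTextureObject(&tex, &rd, &td, NULL);
        return tex;
    }

The kernels would then take the object as an argument and read it with
tex1Dfetch(tex, i), instead of referring to a global texture reference.
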
/* fr->ic is used both by verlet and group kernels (to some extent) now */
- init_interaction_const(fp, &fr->ic, fr, rtab);
+ init_interaction_const(fp, cr, &fr->ic, fr, rtab);
+
if (ir->eDispCorr != edispcNO)
{
calc_enervirdiff(fp, ir->eDispCorr, fr);