From: Szilard Pall
Date: Fri, 4 Oct 2013 00:01:44 +0000 (+0200)
Subject: enable GPU sharing among tMPI ranks
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=726a885cae86bd7598f9abdbccce02948435ed2f;p=alexxy%2Fgromacs.git

enable GPU sharing among tMPI ranks

It turns out that the only issue preventing GPU sharing among
thread-MPI threads was that when the first thread to arrive at
free_gpu() destroys the context, the other thread(s) sharing a GPU
with it are most likely still freeing their resources, an operation
which fails as soon as the "fast" thread has destroyed the context.
Simply placing a barrier between the GPU resource freeing and the
context destruction solves the issue.

However, there is still a very unlikely concurrency hazard after CUDA
texture reference updates (non-bonded parameter table and coulomb
force table initialization). To be on the safe side, with tMPI a
barrier is placed after these operations.

Change-Id: Iac7a39f841ca31a32ab979ee0012cfc18a811d76
---

diff --git a/include/force.h b/include/force.h
index 9dc45af3a3..cc695c6be4 100644
--- a/include/force.h
+++ b/include/force.h
@@ -160,14 +160,6 @@ void init_interaction_const_tables(FILE *fp,
  * use with group kernels.
  */
 
-void init_interaction_const(FILE *fp,
-                            interaction_const_t **interaction_const,
-                            const t_forcerec *fr,
-                            real rtab);
-/* Initializes the interaction constant data structure. Currently it
- * uses forcerec as input.
- */
-
 GMX_LIBMD_EXPORT
 void init_forcerec(FILE *fplog,
                    const output_env_t oenv,
diff --git a/include/network.h b/include/network.h
index 1cc018fdfd..6cfff06ac6 100644
--- a/include/network.h
+++ b/include/network.h
@@ -94,6 +94,7 @@ gmx_bool gmx_mpi_initialized(void);
  * when GROMACS was compiled without MPI support.
  */
 
+GMX_LIBGMX_EXPORT
 void gmx_barrier(const t_commrec *cr);
 /* Wait till all processes in cr->mpi_comm_mygroup have reached the barrier */
diff --git a/src/gmxlib/gmx_detect_hardware.c b/src/gmxlib/gmx_detect_hardware.c
index 2a3156eeaa..e918a78c21 100644
--- a/src/gmxlib/gmx_detect_hardware.c
+++ b/src/gmxlib/gmx_detect_hardware.c
@@ -374,19 +374,11 @@ void gmx_check_hw_runconf_consistency(FILE *fplog,
 
         same_count = gmx_count_gpu_dev_shared(&hw_opt->gpu_opt);
 
-        if (btMPI && same_count > 0)
-        {
-            gmx_fatal(FARGS,
-                      "Invalid GPU assignment: can't share a GPU among multiple thread-MPI threads.\n"
-                      "Use MPI if you are sure that you want to assign a GPU to multiple threads.");
-        }
-
         if (same_count > 0)
         {
             md_print_warn(cr, fplog,
                           "NOTE: Potentially sub-optimal launch configuration: you assigned %s to\n"
-                          "      multiple %s%s; this should be avoided as it can cause\n"
-                          "      performance loss.\n",
+                          "      multiple %s%s; this can cause performance loss.\n",
                           same_count > 1 ? "GPUs" : "a GPU", th_or_proc, btMPI ? "s" : "es");
         }
     }
diff --git a/src/kernel/pme_loadbal.c b/src/kernel/pme_loadbal.c
index d1a2f8efee..9700ceb147 100644
--- a/src/kernel/pme_loadbal.c
+++ b/src/kernel/pme_loadbal.c
@@ -669,6 +669,25 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
         nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
     {
         nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
+
+        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+         * also sharing texture references. To keep the code simple, we don't
+         * treat texture references as shared resources, but this means that
+         * the coulomb_tab texture ref will get updated by multiple threads.
+         * Hence, to ensure that the non-bonded kernels don't start before all
+         * texture binding operations are finished, we need to wait for all ranks
+         * to arrive here before continuing.
+         *
+         * Note that we could omit this barrier if GPUs are not shared (or
+         * texture objects are used), but as this is initialization code, there
+         * is no point in complicating things.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif /* GMX_THREAD_MPI */
     }
     else
     {
diff --git a/src/kernel/runner.c b/src/kernel/runner.c
index 3665c47406..42ef770eda 100644
--- a/src/kernel/runner.c
+++ b/src/kernel/runner.c
@@ -960,6 +960,48 @@ static void override_nsteps_cmdline(FILE *fplog,
     }
 }
 
+/* Frees GPU memory and destroys the CUDA context.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ */
+static void free_gpu_resources(FILE *fplog,
+                               const t_forcerec *fr,
+                               const t_commrec *cr)
+{
+    gmx_bool bIsPPrankUsingGPU;
+    char     gpu_err_str[STRLEN];
+
+    bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU;
+
+    if (bIsPPrankUsingGPU)
+    {
+        /* free nbnxn data in GPU memory */
+        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
+
+        /* With tMPI we need to wait for all ranks to finish deallocation before
+         * destroying the context in free_gpu(), as some ranks may be sharing
+         * a GPU and context.
+         * Note: only PP ranks need to free GPU resources, so it is safe not
+         * to call the barrier on PME ranks.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif /* GMX_THREAD_MPI */
+
+        /* uninitialize GPU (by destroying the context) */
+        if (!free_gpu(gpu_err_str))
+        {
+            gmx_warning("On node %d failed to free GPU #%d: %s",
+                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+        }
+    }
+}
+
 int mdrunner(gmx_hw_opt_t *hw_opt,
              FILE *fplog, t_commrec *cr, int nfile,
              const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
@@ -1703,19 +1745,9 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                 nthreads_pp,
                 EI_DYNAMICS(inputrec->eI) && !MULTISIM(cr));
 
-    if ((cr->duty & DUTY_PP) && fr->nbv != NULL && fr->nbv->bUseGPU)
-    {
-        char gpu_err_str[STRLEN];
-
-        /* free GPU memory and uninitialize GPU (by destroying the context) */
-        nbnxn_cuda_free(fplog, fr->nbv->cu_nbv);
-        if (!free_gpu(gpu_err_str))
-        {
-            gmx_warning("On node %d failed to free GPU #%d: %s",
-                        cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
-        }
-    }
+    /* Free GPU memory and context */
+    free_gpu_resources(fplog, fr, cr);
 
     if (opt2bSet("-membed", nfile, fnm))
     {
diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c
index f7da120aca..6a1f9db586 100644
--- a/src/mdlib/forcerec.c
+++ b/src/mdlib/forcerec.c
@@ -1788,10 +1788,11 @@ void init_interaction_const_tables(FILE *fp,
     }
 }
 
-void init_interaction_const(FILE *fp,
-                            interaction_const_t **interaction_const,
-                            const t_forcerec *fr,
-                            real rtab)
+static void init_interaction_const(FILE *fp,
+                                   const t_commrec *cr,
+                                   interaction_const_t **interaction_const,
+                                   const t_forcerec *fr,
+                                   real rtab)
 {
     interaction_const_t *ic;
     gmx_bool bUsesSimpleTables = TRUE;
@@ -1876,6 +1877,25 @@ void init_interaction_const(FILE *fp,
     if (fr->nbv != NULL && fr->nbv->bUseGPU)
     {
         nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+
+        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+         * also sharing texture references. To keep the code simple, we don't
+         * treat texture references as shared resources, but this means that
+         * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+         * Hence, to ensure that the non-bonded kernels don't start before all
+         * texture binding operations are finished, we need to wait for all ranks
+         * to arrive here before continuing.
+         *
+         * Note that we could omit this barrier if GPUs are not shared (or
+         * texture objects are used), but as this is initialization code, there
+         * is no point in complicating things.
+         */
+#ifdef GMX_THREAD_MPI
+        if (PAR(cr))
+        {
+            gmx_barrier(cr);
+        }
+#endif /* GMX_THREAD_MPI */
     }
 
     bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
@@ -2905,7 +2925,8 @@ void init_forcerec(FILE *fp,
     }
 
     /* fr->ic is used both by verlet and group kernels (to some extent) now */
-    init_interaction_const(fp, &fr->ic, fr, rtab);
+    init_interaction_const(fp, cr, &fr->ic, fr, rtab);
+
     if (ir->eDispCorr != edispcNO)
    {
         calc_enervirdiff(fp, ir->eDispCorr, fr);
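
For illustration only, a minimal standalone sketch of the teardown ordering
that the free_gpu_resources() added above enforces: free per-rank device
allocations, synchronize all ranks sharing the device, and only then tear
down the context. It uses plain MPI and the CUDA runtime API in place of
GROMACS' thread-MPI gmx_barrier() wrapper, and d_buf is a hypothetical
per-rank device buffer; this sketch is not part of the patch.

#include <mpi.h>
#include <cuda_runtime.h>

/* Every rank first frees its own device allocations, then all ranks sharing
 * the device synchronize, and only then is the context torn down. */
static void free_gpu_resources_sketch(float *d_buf, MPI_Comm comm)
{
    /* 1. free this rank's GPU allocations */
    cudaFree(d_buf);

    /* 2. wait until every rank sharing the GPU has finished freeing */
    MPI_Barrier(comm);

    /* 3. only now is it safe to destroy the (shared) context */
    cudaDeviceReset();
}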