From 8b644f19cacb59fe17d1876436a60f3d02839b8c Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Fri, 3 Nov 2017 02:33:01 +0100 Subject: [PATCH] Separate management of GPU contexts from modules Tasks from modules might share GPU contexts across either tasks, or thread-MPI ranks, so init and free operations can't be the responsibility of the modules themselves. Simplified the error reporting for init and free. Knowledge of the rank ID might help in diagnosing issues in some cases, but should (later) be the responsibility of a proper framework to catch errors during initialization across MPI ranks. Moved the GPU profiling cleanup back to where it was intended to be, before some earlier refactoring had left it somewhere not-quite-right. Change-Id: I682a1b1c7058cbebb41805dba05e688cbee18c2a --- src/gromacs/ewald/pme-gpu-internal.cpp | 11 +-- src/gromacs/ewald/pme-gpu-internal.h | 5 +- src/gromacs/ewald/pme.cpp | 4 +- src/gromacs/ewald/tests/pmetestcommon.cpp | 5 ++ src/gromacs/gpu_utils/gpu_utils.cu | 29 +++---- src/gromacs/gpu_utils/gpu_utils.h | 9 +-- src/gromacs/gpu_utils/gpu_utils_ocl.cpp | 1 - src/gromacs/mdlib/force.h | 3 +- src/gromacs/mdlib/forcerec.cpp | 87 ++++++++------------- src/gromacs/mdlib/forcerec.h | 28 +++---- src/gromacs/taskassignment/taskassignment.h | 7 ++ src/programs/mdrun/runner.cpp | 38 ++++++--- 12 files changed, 108 insertions(+), 119 deletions(-) diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp index 6e1d9992be..9487fd0c5d 100644 --- a/src/gromacs/ewald/pme-gpu-internal.cpp +++ b/src/gromacs/ewald/pme-gpu-internal.cpp @@ -282,11 +282,8 @@ static bool pme_gpu_check_restrictions(const gmx_pme_t *pme, std::string *error) * * \param[in,out] pme The PME structure. * \param[in,out] gpuInfo The GPU information structure. - * \param[in] mdlog The logger. - * \param[in] cr The communication structure. */ -static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, - const t_commrec *cr) +static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo) { std::string errorString; bool canRunOnGpu = pme_gpu_check_restrictions(pme, &errorString); @@ -308,8 +305,6 @@ static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx:: pme_gpu_set_testing(pmeGPU, false); - // GPU initialization - init_gpu(mdlog, cr->nodeid, gpuInfo); pmeGPU->deviceInfo = gpuInfo; pme_gpu_init_internal(pmeGPU); @@ -395,7 +390,7 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx: } } -void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr) +void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo) { if (!pme_gpu_active(pme)) { @@ -405,7 +400,7 @@ void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLog if (!pme->gpu) { /* First-time initialization */ - pme_gpu_init(pme, gpuInfo, mdlog, cr); + pme_gpu_init(pme, gpuInfo); } else { diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h index be50ead6af..33c83103a8 100644 --- a/src/gromacs/ewald/pme-gpu-internal.h +++ b/src/gromacs/ewald/pme-gpu-internal.h @@ -55,7 +55,6 @@ struct gmx_hw_info_t; struct gmx_gpu_opt_t; struct gmx_pme_t; // only used in pme_gpu_reinit -struct t_commrec; struct gmx_wallclock_gpu_pme_t; struct pme_atomcomm_t; struct t_complex; @@ -636,11 +635,9 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu *pmeGPU, gmx::IVec *gridSize, gmx: * * \param[in,out] pme The PME structure. 
* \param[in,out] gpuInfo The GPU information structure. - * \param[in] mdlog The logger. - * \param[in] cr The communication structure. * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs. */ -void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr); +void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo); /*! \libinternal \brief * Destroys the PME GPU data at the end of the run. diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp index 344d8553cc..c90254b0a3 100644 --- a/src/gromacs/ewald/pme.cpp +++ b/src/gromacs/ewald/pme.cpp @@ -520,7 +520,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr, PmeRunMode runMode, PmeGpu *pmeGPU, gmx_device_info_t *gpuInfo, - const gmx::MDLogger &mdlog) + const gmx::MDLogger & /*mdlog*/) { int use_threads, sum_use_threads, i; ivec ndata; @@ -853,7 +853,7 @@ gmx_pme_t *gmx_pme_init(const t_commrec *cr, pme->lb_buf2 = nullptr; pme->lb_buf_nalloc = 0; - pme_gpu_reinit(pme.get(), gpuInfo, mdlog, cr); + pme_gpu_reinit(pme.get(), gpuInfo); pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx); diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index f7c7b0f141..f9ea663ce6 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -52,6 +52,7 @@ #include "gromacs/ewald/pme-solve.h" #include "gromacs/ewald/pme-spread.h" #include "gromacs/fft/parallel_3dfft.h" +#include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/math/invertmatrix.h" #include "gromacs/mdtypes/commrec.h" #include "gromacs/pbcutil/pbc.h" @@ -103,6 +104,10 @@ static PmeSafePointer pmeInitInternal(const t_inputrec *inputRec, ) { const MDLogger dummyLogger; + if (gpuInfo) + { + init_gpu(dummyLogger, gpuInfo); + } const auto runMode = (mode == CodePath::CPU) ? 
PmeRunMode::CPU : PmeRunMode::GPU; t_commrec dummyCommrec = {0}; gmx_pme_t *pmeDataRaw = gmx_pme_init(&dummyCommrec, 1, 1, inputRec, atomCount, false, false, true, diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index fb509f5d37..4e22007940 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -55,11 +55,13 @@ #include "gromacs/hardware/gpu_hw_info.h" #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/cstringutil.h" +#include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/logger.h" #include "gromacs/utility/programcontext.h" #include "gromacs/utility/smalloc.h" #include "gromacs/utility/snprintf.h" +#include "gromacs/utility/stringutil.h" #if HAVE_NVML #include @@ -507,20 +509,18 @@ static gmx_bool reset_gpu_application_clocks(const gmx_device_info_t gmx_unused #endif /* HAVE_NVML_APPLICATION_CLOCKS */ } -void init_gpu(const gmx::MDLogger &mdlog, int rank, - gmx_device_info_t *deviceInfo) +void init_gpu(const gmx::MDLogger &mdlog, + gmx_device_info_t *deviceInfo) { cudaError_t stat; - char sbuf[STRLEN]; assert(deviceInfo); stat = cudaSetDevice(deviceInfo->id); if (stat != cudaSuccess) { - snprintf(sbuf, STRLEN, "On rank %d failed to initialize GPU #%d", - rank, deviceInfo->id); - CU_RET_ERR(stat, sbuf); + auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id); + CU_RET_ERR(stat, message.c_str()); } if (debug) @@ -534,13 +534,9 @@ void init_gpu(const gmx::MDLogger &mdlog, int rank, init_gpu_application_clocks(mdlog, deviceInfo); } -gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo, - char *result_str) +void free_gpu(const gmx_device_info_t *deviceInfo) { cudaError_t stat; - gmx_bool reset_gpu_application_clocks_status = true; - - assert(result_str); if (debug) { @@ -552,12 +548,17 @@ gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo, if (deviceInfo != nullptr) { - reset_gpu_application_clocks_status = reset_gpu_application_clocks(deviceInfo); + if (!reset_gpu_application_clocks(deviceInfo)) + { + gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id); + } } stat = cudaDeviceReset(); - strncpy(result_str, cudaGetErrorString(stat), STRLEN); - return (stat == cudaSuccess) && reset_gpu_application_clocks_status; + if (stat != cudaSuccess) + { + gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat)); + } } gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info, diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h index 8f59630462..b1cc07b51f 100644 --- a/src/gromacs/gpu_utils/gpu_utils.h +++ b/src/gromacs/gpu_utils/gpu_utils.h @@ -114,7 +114,6 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU * the patterns here are the same as elsewhere in this header. * * param[in] mdlog log file to write to - * param[in] rank MPI rank of this process (for error output) * \param[inout] deviceInfo device info of the GPU to initialize * * Issues a fatal error for any critical errors that occur during @@ -122,7 +121,6 @@ void free_gpu_info(const struct gmx_gpu_info_t *GPU_FUNC_ARGUMENT(gpu_info)) GPU */ GPU_FUNC_QUALIFIER void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog), - int GPU_FUNC_ARGUMENT(rank), gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM /*! \brief Frees up the CUDA GPU used by the active context at the time of calling. 
@@ -130,15 +128,14 @@ void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog), * The context is explicitly destroyed and therefore all data uploaded to the GPU * is lost. This should only be called when none of this data is required anymore. * + * Calls gmx_warning upon errors. + * * \param[in] deviceInfo device info of the GPU to clean up for - * \param[out] result_str the message related to the error that occurred - * during the initialization (if there was any). * * \returns true if no error occurs during the freeing. */ CUDA_FUNC_QUALIFIER -gmx_bool free_cuda_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo), - char *CUDA_FUNC_ARGUMENT(result_str)) CUDA_FUNC_TERM_WITH_RETURN(TRUE) +void free_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo)) CUDA_FUNC_TERM /*! \brief Return a pointer to the device info for \c deviceId * diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp index 5195baccc5..c121f813b3 100644 --- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp +++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp @@ -401,7 +401,6 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind //! This function is documented in the header file void init_gpu(const gmx::MDLogger & /*mdlog*/, - int /* rank */, gmx_device_info_t *deviceInfo) { assert(deviceInfo); diff --git a/src/gromacs/mdlib/force.h b/src/gromacs/mdlib/force.h index 87d4fe9496..2d83e17a25 100644 --- a/src/gromacs/mdlib/force.h +++ b/src/gromacs/mdlib/force.h @@ -216,7 +216,6 @@ void do_force_lowlevel(t_forcerec *fr, /* Call all the force routines */ void free_gpu_resources(const t_forcerec *fr, - const t_commrec *cr, - const gmx_device_info_t *deviceInfo); + const t_commrec *cr); #endif diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 05c59c2f29..0d9cfb49c7 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -2134,19 +2134,15 @@ init_interaction_const(FILE *fp, *interaction_const = ic; } -/* TODO deviceInfo should be logically const, but currently - * init_gpu modifies it to set up NVML support. This could - * happen during the detection phase, and deviceInfo could - * the become const. */ -static void init_nb_verlet(const gmx::MDLogger &mdlog, - nonbonded_verlet_t **nb_verlet, - gmx_bool bFEP_NonBonded, - const t_inputrec *ir, - const t_forcerec *fr, - const t_commrec *cr, - gmx_device_info_t *deviceInfo, - const gmx_mtop_t *mtop, - matrix box) +static void init_nb_verlet(const gmx::MDLogger &mdlog, + nonbonded_verlet_t **nb_verlet, + gmx_bool bFEP_NonBonded, + const t_inputrec *ir, + const t_forcerec *fr, + const t_commrec *cr, + const gmx_device_info_t *deviceInfo, + const gmx_mtop_t *mtop, + matrix box) { nonbonded_verlet_t *nbv; char *env; @@ -2161,12 +2157,6 @@ static void init_nb_verlet(const gmx::MDLogger &mdlog, GMX_RELEASE_ASSERT(!(nbv->emulateGpu == EmulateGpuNonbonded::Yes && nbv->bUseGPU), "When GPU emulation is active, there cannot be a GPU assignment"); - if (nbv->bUseGPU) - { - /* Use the assigned GPU. 
*/ - init_gpu(mdlog, cr->nodeid, deviceInfo); - } - nbv->nbs = nullptr; nbv->min_ci_balanced = 0; @@ -2319,20 +2309,20 @@ gmx_bool usingGpu(nonbonded_verlet_t *nbv) return nbv != nullptr && nbv->bUseGPU; } -void init_forcerec(FILE *fp, - const gmx::MDLogger &mdlog, - t_forcerec *fr, - t_fcdata *fcd, - const t_inputrec *ir, - const gmx_mtop_t *mtop, - const t_commrec *cr, - matrix box, - const char *tabfn, - const char *tabpfn, - const t_filenm *tabbfnm, - gmx_device_info_t *deviceInfo, - gmx_bool bNoSolvOpt, - real print_force) +void init_forcerec(FILE *fp, + const gmx::MDLogger &mdlog, + t_forcerec *fr, + t_fcdata *fcd, + const t_inputrec *ir, + const gmx_mtop_t *mtop, + const t_commrec *cr, + matrix box, + const char *tabfn, + const char *tabpfn, + const t_filenm *tabbfnm, + const gmx_device_info_t *deviceInfo, + gmx_bool bNoSolvOpt, + real print_force) { int i, m, negp_pp, negptable, egi, egj; real rtab; @@ -3156,27 +3146,26 @@ void init_forcerec(FILE *fp, } } -/* Frees GPU memory and destroys the GPU context. +/* Frees GPU memory and sets a tMPI node barrier. * * Note that this function needs to be called even if GPUs are not used * in this run because the PME ranks have no knowledge of whether GPUs * are used or not, but all ranks need to enter the barrier below. + * \todo Remove physical node barrier from this function after making sure + * that it's not needed anymore (with a shared GPU run). */ void free_gpu_resources(const t_forcerec *fr, - const t_commrec *cr, - const gmx_device_info_t *deviceInfo) + const t_commrec *cr) { - gmx_bool bIsPPrankUsingGPU; - char gpu_err_str[STRLEN]; + bool isPPrankUsingGPU = fr && fr->nbv && fr->nbv->bUseGPU; - bIsPPrankUsingGPU = thisRankHasDuty(cr, DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU; + /* stop the GPU profiler (only CUDA) */ + stopGpuProfiler(); - if (bIsPPrankUsingGPU) + if (isPPrankUsingGPU) { /* free nbnxn data in GPU memory */ nbnxn_gpu_free(fr->nbv->gpu_nbv); - /* stop the GPU profiler (only CUDA) */ - stopGpuProfiler(); } /* With tMPI we need to wait for all ranks to finish deallocation before @@ -3189,20 +3178,8 @@ void free_gpu_resources(const t_forcerec *fr, * Note: it is safe to not call the barrier on the ranks which do not use GPU, * but it is easier and more futureproof to call it on the whole node. 
*/
-#if GMX_THREAD_MPI
- if (PAR(cr) || MULTISIM(cr))
+ if (GMX_THREAD_MPI && (PAR(cr) || MULTISIM(cr)))
{
gmx_barrier_physical_node(cr);
}
-#endif /* GMX_THREAD_MPI */
-
- if (bIsPPrankUsingGPU)
- {
- /* uninitialize GPU (by destroying the context) */
- if (!free_cuda_gpu(deviceInfo, gpu_err_str))
- {
- gmx_warning("On rank %d failed to free GPU #%d: %s",
- cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
- }
- }
}
diff --git a/src/gromacs/mdlib/forcerec.h b/src/gromacs/mdlib/forcerec.h
index 5dd6a73fca..9906a755a1 100644
--- a/src/gromacs/mdlib/forcerec.h
+++ b/src/gromacs/mdlib/forcerec.h
@@ -111,20 +111,20 @@ void init_interaction_const_tables(FILE *fp,
* \param[in] bNoSolvOpt Do not use solvent optimization
* \param[in] print_force Print forces for atoms with force >= print_force
*/
-void init_forcerec(FILE *fplog,
- const gmx::MDLogger &mdlog,
- t_forcerec *fr,
- t_fcdata *fcd,
- const t_inputrec *ir,
- const gmx_mtop_t *mtop,
- const t_commrec *cr,
- matrix box,
- const char *tabfn,
- const char *tabpfn,
- const t_filenm *tabbfnm,
- gmx_device_info_t *deviceInfo,
- gmx_bool bNoSolvOpt,
- real print_force);
+void init_forcerec(FILE *fplog,
+ const gmx::MDLogger &mdlog,
+ t_forcerec *fr,
+ t_fcdata *fcd,
+ const t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ const t_commrec *cr,
+ matrix box,
+ const char *tabfn,
+ const char *tabpfn,
+ const t_filenm *tabbfnm,
+ const gmx_device_info_t *deviceInfo,
+ gmx_bool bNoSolvOpt,
+ real print_force);
/*! \brief Divide exclusions over threads
*
diff --git a/src/gromacs/taskassignment/taskassignment.h b/src/gromacs/taskassignment/taskassignment.h
index 294230543e..62123e0824 100644
--- a/src/gromacs/taskassignment/taskassignment.h
+++ b/src/gromacs/taskassignment/taskassignment.h
@@ -118,6 +118,13 @@ runTaskAssignment(const std::vector<int> &gpuIdsToUse,
const t_commrec *cr,
const std::vector<GpuTask> &gpuTasksOnThisRank);
+//! Function for whether the task of \c mapping has value \c TaskType.
+template<GpuTask TaskType>
+bool hasTaskType(const GpuTaskMapping &mapping)
+{
+ return mapping.task_ == TaskType;
+}
+
} // namespace
#endif
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index 53c55ff09f..b266ecb415 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -84,6 +84,7 @@
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/minimize.h"
#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_tuning.h"
#include "gromacs/mdlib/qmmm.h"
@@ -1009,22 +1010,25 @@ int Mdrunner::mdrunner()
cr, mdlog);
gmx_device_info_t *nonbondedDeviceInfo = nullptr;
- int nonbondedDeviceId = -1;
+
if (thisRankHasDuty(cr, DUTY_PP))
{
- if (!gpuTaskAssignment.empty())
+ // This works because only one task of each type is currently permitted.
+ auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+ hasTaskType<GpuTask::Nonbonded>);
+ if (nbGpuTaskMapping != gpuTaskAssignment.end())
{
- GMX_RELEASE_ASSERT(gpuTaskAssignment.size() == 1, "A valid GPU assignment can only have one task per rank");
- GMX_RELEASE_ASSERT(gpuTaskAssignment[0].task_ == gmx::GpuTask::Nonbonded, "A valid GPU assignment can only include short-ranged tasks");
- nonbondedDeviceId = gpuTaskAssignment[0].deviceId_;
+ int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
- }
- }
+ init_gpu(mdlog, nonbondedDeviceInfo);
- if (DOMAINDECOMP(cr))
- {
- /* When we share GPUs over ranks, we need to know this for the DLB */
- dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+ if (DOMAINDECOMP(cr))
+ {
+ /* When we share GPUs over ranks, we need to know this for the DLB */
+ dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+ }
+
+ }
}
/* getting number of PP/PME threads
@@ -1306,8 +1310,16 @@ int Mdrunner::mdrunner()
pmedata = nullptr;
}
- /* Free GPU memory and context */
- free_gpu_resources(fr, cr, nonbondedDeviceInfo);
+ // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
+ // before we destroy the GPU context(s) in free_gpu_resources().
+ // Pinned buffers are associated with contexts in CUDA.
+ // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
+ mdAtoms.reset(nullptr);
+ globalState.reset(nullptr);
+
+ /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
+ free_gpu_resources(fr, cr);
+ free_gpu(nonbondedDeviceInfo);
if (doMembed)
{
-- 2.22.0
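
A minimal sketch of the caller-managed GPU lifecycle that the runner follows after this patch, restating the runner.cpp hunks above in outline form for a PP rank with a nonbonded GPU task. The names (gpuTaskAssignment, hwinfo, mdlog, cr, fr) are those used in runner.cpp; error handling, domain decomposition and the PME path are omitted:

    // The runner, not the nonbonded or PME module, locates the assigned device ...
    auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
                                         hasTaskType<GpuTask::Nonbonded>);
    gmx_device_info_t *nonbondedDeviceInfo = nullptr;
    if (nbGpuTaskMapping != gpuTaskAssignment.end())
    {
        // ... and initializes its context once per rank.
        nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nbGpuTaskMapping->deviceId_);
        init_gpu(mdlog, nonbondedDeviceInfo);
    }

    // Modules receive the device handle via init_forcerec()/gmx_pme_init() and
    // no longer call init_gpu() themselves.

    // At shutdown, modules free their device buffers (and, with thread-MPI, all
    // ranks pass the physical-node barrier) before the runner destroys the context last.
    free_gpu_resources(fr, cr);
    free_gpu(nonbondedDeviceInfo);

Because free_gpu() only warns on failure, the runner calls this teardown unconditionally, so the same path is taken even on ranks that were not assigned a GPU.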