*
* \param[in,out] pme The PME structure.
* \param[in,out] gpuInfo The GPU information structure.
- * \param[in] mdlog The logger.
- * \param[in] cr The communication structure.
*/
-static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog,
- const t_commrec *cr)
+static void pme_gpu_init(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
{
std::string errorString;
bool canRunOnGpu = pme_gpu_check_restrictions(pme, &errorString);
pme_gpu_set_testing(pmeGPU, false);
- // GPU initialization
- init_gpu(mdlog, cr->nodeid, gpuInfo);
pmeGPU->deviceInfo = gpuInfo;
pme_gpu_init_internal(pmeGPU);
}
}
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr)
+void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo)
{
if (!pme_gpu_active(pme))
{
if (!pme->gpu)
{
/* First-time initialization */
- pme_gpu_init(pme, gpuInfo, mdlog, cr);
+ pme_gpu_init(pme, gpuInfo);
}
else
{
struct gmx_hw_info_t;
struct gmx_gpu_opt_t;
struct gmx_pme_t; // only used in pme_gpu_reinit
-struct t_commrec;
struct gmx_wallclock_gpu_pme_t;
struct pme_atomcomm_t;
struct t_complex;
*
* \param[in,out] pme The PME structure.
* \param[in,out] gpuInfo The GPU information structure.
- * \param[in] mdlog The logger.
- * \param[in] cr The communication structure.
* \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
*/
-void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo, const gmx::MDLogger &mdlog, const t_commrec *cr);
+void pme_gpu_reinit(gmx_pme_t *pme, gmx_device_info_t *gpuInfo);
/*! \libinternal \brief
* Destroys the PME GPU data at the end of the run.
PmeRunMode runMode,
PmeGpu *pmeGPU,
gmx_device_info_t *gpuInfo,
- const gmx::MDLogger &mdlog)
+ const gmx::MDLogger & /*mdlog*/)
{
int use_threads, sum_use_threads, i;
ivec ndata;
pme->lb_buf2 = nullptr;
pme->lb_buf_nalloc = 0;
- pme_gpu_reinit(pme.get(), gpuInfo, mdlog, cr);
+ pme_gpu_reinit(pme.get(), gpuInfo);
pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
#include "gromacs/ewald/pme-solve.h"
#include "gromacs/ewald/pme-spread.h"
#include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/invertmatrix.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/pbcutil/pbc.h"
)
{
const MDLogger dummyLogger;
+ if (gpuInfo)
+ {
+ init_gpu(dummyLogger, gpuInfo);
+ }
const auto runMode = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::GPU;
t_commrec dummyCommrec = {0};
gmx_pme_t *pmeDataRaw = gmx_pme_init(&dummyCommrec, 1, 1, inputRec, atomCount, false, false, true,
#include "gromacs/hardware/gpu_hw_info.h"
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/snprintf.h"
+#include "gromacs/utility/stringutil.h"
#if HAVE_NVML
#include <nvml.h>
#endif /* HAVE_NVML_APPLICATION_CLOCKS */
}
-void init_gpu(const gmx::MDLogger &mdlog, int rank,
- gmx_device_info_t *deviceInfo)
+void init_gpu(const gmx::MDLogger &mdlog,
+ gmx_device_info_t *deviceInfo)
{
cudaError_t stat;
- char sbuf[STRLEN];
assert(deviceInfo);
stat = cudaSetDevice(deviceInfo->id);
if (stat != cudaSuccess)
{
- snprintf(sbuf, STRLEN, "On rank %d failed to initialize GPU #%d",
- rank, deviceInfo->id);
- CU_RET_ERR(stat, sbuf);
+ auto message = gmx::formatString("Failed to initialize GPU #%d", deviceInfo->id);
+ CU_RET_ERR(stat, message.c_str());
}
if (debug)
init_gpu_application_clocks(mdlog, deviceInfo);
}
-gmx_bool free_cuda_gpu(const gmx_device_info_t *deviceInfo,
- char *result_str)
+void free_gpu(const gmx_device_info_t *deviceInfo)
{
cudaError_t stat;
- gmx_bool reset_gpu_application_clocks_status = true;
-
- assert(result_str);
if (debug)
{
if (deviceInfo != nullptr)
{
- reset_gpu_application_clocks_status = reset_gpu_application_clocks(deviceInfo);
+ if (!reset_gpu_application_clocks(deviceInfo))
+ {
+ gmx_warning("Failed to reset GPU application clocks on GPU #%d", deviceInfo->id);
+ }
}
stat = cudaDeviceReset();
- strncpy(result_str, cudaGetErrorString(stat), STRLEN);
- return (stat == cudaSuccess) && reset_gpu_application_clocks_status;
+ if (stat != cudaSuccess)
+ {
+ gmx_warning("Failed to free GPU #%d: %s", deviceInfo->id, cudaGetErrorString(stat));
+ }
}
gmx_device_info_t *getDeviceInfo(const gmx_gpu_info_t &gpu_info,
* the patterns here are the same as elsewhere in this header.
*
* param[in] mdlog log file to write to
- * param[in] rank MPI rank of this process (for error output)
* \param[inout] deviceInfo device info of the GPU to initialize
*
* Issues a fatal error for any critical errors that occur during
*/
GPU_FUNC_QUALIFIER
void init_gpu(const gmx::MDLogger &GPU_FUNC_ARGUMENT(mdlog),
- int GPU_FUNC_ARGUMENT(rank),
gmx_device_info_t *GPU_FUNC_ARGUMENT(deviceInfo)) GPU_FUNC_TERM
/*! \brief Frees up the CUDA GPU used by the active context at the time of calling.
* The context is explicitly destroyed and therefore all data uploaded to the GPU
* is lost. This should only be called when none of this data is required anymore.
*
+ * Calls gmx_warning upon errors.
+ *
* \param[in] deviceInfo device info of the GPU to clean up for
- * \param[out] result_str the message related to the error that occurred
- * during the initialization (if there was any).
*
* \returns true if no error occurs during the freeing.
*/
CUDA_FUNC_QUALIFIER
-gmx_bool free_cuda_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo),
- char *CUDA_FUNC_ARGUMENT(result_str)) CUDA_FUNC_TERM_WITH_RETURN(TRUE)
+void free_gpu(const gmx_device_info_t *CUDA_FUNC_ARGUMENT(deviceInfo)) CUDA_FUNC_TERM
/*! \brief Return a pointer to the device info for \c deviceId
*
//! This function is documented in the header file
void init_gpu(const gmx::MDLogger & /*mdlog*/,
- int /* rank */,
gmx_device_info_t *deviceInfo)
{
assert(deviceInfo);
/* Call all the force routines */
void free_gpu_resources(const t_forcerec *fr,
- const t_commrec *cr,
- const gmx_device_info_t *deviceInfo);
+ const t_commrec *cr);
#endif
*interaction_const = ic;
}
-/* TODO deviceInfo should be logically const, but currently
- * init_gpu modifies it to set up NVML support. This could
- * happen during the detection phase, and deviceInfo could
- * the become const. */
-static void init_nb_verlet(const gmx::MDLogger &mdlog,
- nonbonded_verlet_t **nb_verlet,
- gmx_bool bFEP_NonBonded,
- const t_inputrec *ir,
- const t_forcerec *fr,
- const t_commrec *cr,
- gmx_device_info_t *deviceInfo,
- const gmx_mtop_t *mtop,
- matrix box)
+static void init_nb_verlet(const gmx::MDLogger &mdlog,
+ nonbonded_verlet_t **nb_verlet,
+ gmx_bool bFEP_NonBonded,
+ const t_inputrec *ir,
+ const t_forcerec *fr,
+ const t_commrec *cr,
+ const gmx_device_info_t *deviceInfo,
+ const gmx_mtop_t *mtop,
+ matrix box)
{
nonbonded_verlet_t *nbv;
char *env;
GMX_RELEASE_ASSERT(!(nbv->emulateGpu == EmulateGpuNonbonded::Yes && nbv->bUseGPU), "When GPU emulation is active, there cannot be a GPU assignment");
- if (nbv->bUseGPU)
- {
- /* Use the assigned GPU. */
- init_gpu(mdlog, cr->nodeid, deviceInfo);
- }
-
nbv->nbs = nullptr;
nbv->min_ci_balanced = 0;
return nbv != nullptr && nbv->bUseGPU;
}
-void init_forcerec(FILE *fp,
- const gmx::MDLogger &mdlog,
- t_forcerec *fr,
- t_fcdata *fcd,
- const t_inputrec *ir,
- const gmx_mtop_t *mtop,
- const t_commrec *cr,
- matrix box,
- const char *tabfn,
- const char *tabpfn,
- const t_filenm *tabbfnm,
- gmx_device_info_t *deviceInfo,
- gmx_bool bNoSolvOpt,
- real print_force)
+void init_forcerec(FILE *fp,
+ const gmx::MDLogger &mdlog,
+ t_forcerec *fr,
+ t_fcdata *fcd,
+ const t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ const t_commrec *cr,
+ matrix box,
+ const char *tabfn,
+ const char *tabpfn,
+ const t_filenm *tabbfnm,
+ const gmx_device_info_t *deviceInfo,
+ gmx_bool bNoSolvOpt,
+ real print_force)
{
int i, m, negp_pp, negptable, egi, egj;
real rtab;
}
}
-/* Frees GPU memory and destroys the GPU context.
+/* Frees GPU memory and sets a tMPI node barrier.
*
* Note that this function needs to be called even if GPUs are not used
* in this run because the PME ranks have no knowledge of whether GPUs
* are used or not, but all ranks need to enter the barrier below.
+ * \todo Remove physical node barrier from this function after making sure
+ * that it's not needed anymore (with a shared GPU run).
*/
void free_gpu_resources(const t_forcerec *fr,
- const t_commrec *cr,
- const gmx_device_info_t *deviceInfo)
+ const t_commrec *cr)
{
- gmx_bool bIsPPrankUsingGPU;
- char gpu_err_str[STRLEN];
+ bool isPPrankUsingGPU = fr && fr->nbv && fr->nbv->bUseGPU;
- bIsPPrankUsingGPU = thisRankHasDuty(cr, DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU;
+ /* stop the GPU profiler (only CUDA) */
+ stopGpuProfiler();
- if (bIsPPrankUsingGPU)
+ if (isPPrankUsingGPU)
{
/* free nbnxn data in GPU memory */
nbnxn_gpu_free(fr->nbv->gpu_nbv);
- /* stop the GPU profiler (only CUDA) */
- stopGpuProfiler();
}
/* With tMPI we need to wait for all ranks to finish deallocation before
* Note: it is safe to not call the barrier on the ranks which do not use GPU,
* but it is easier and more futureproof to call it on the whole node.
*/
-#if GMX_THREAD_MPI
- if (PAR(cr) || MULTISIM(cr))
+ if (GMX_THREAD_MPI && (PAR(cr) || MULTISIM(cr)))
{
gmx_barrier_physical_node(cr);
}
-#endif /* GMX_THREAD_MPI */
-
- if (bIsPPrankUsingGPU)
- {
- /* uninitialize GPU (by destroying the context) */
- if (!free_cuda_gpu(deviceInfo, gpu_err_str))
- {
- gmx_warning("On rank %d failed to free GPU #%d: %s",
- cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);
- }
- }
}
* \param[in] bNoSolvOpt Do not use solvent optimization
* \param[in] print_force Print forces for atoms with force >= print_force
*/
-void init_forcerec(FILE *fplog,
- const gmx::MDLogger &mdlog,
- t_forcerec *fr,
- t_fcdata *fcd,
- const t_inputrec *ir,
- const gmx_mtop_t *mtop,
- const t_commrec *cr,
- matrix box,
- const char *tabfn,
- const char *tabpfn,
- const t_filenm *tabbfnm,
- gmx_device_info_t *deviceInfo,
- gmx_bool bNoSolvOpt,
- real print_force);
+void init_forcerec(FILE *fplog,
+ const gmx::MDLogger &mdlog,
+ t_forcerec *fr,
+ t_fcdata *fcd,
+ const t_inputrec *ir,
+ const gmx_mtop_t *mtop,
+ const t_commrec *cr,
+ matrix box,
+ const char *tabfn,
+ const char *tabpfn,
+ const t_filenm *tabbfnm,
+ const gmx_device_info_t *deviceInfo,
+ gmx_bool bNoSolvOpt,
+ real print_force);
/*! \brief Divide exclusions over threads
*
const t_commrec *cr,
const std::vector<GpuTask> &gpuTasksOnThisRank);
+//! Function for whether the task of \c mapping has value \c TaskType.
+template<GpuTask TaskType>
+bool hasTaskType(const GpuTaskMapping &mapping)
+{
+ return mapping.task_ == TaskType;
+}
+
} // namespace
#endif
#include "gromacs/mdlib/mdrun.h"
#include "gromacs/mdlib/minimize.h"
#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
#include "gromacs/mdlib/nbnxn_search.h"
#include "gromacs/mdlib/nbnxn_tuning.h"
#include "gromacs/mdlib/qmmm.h"
cr, mdlog);
gmx_device_info_t *nonbondedDeviceInfo = nullptr;
- int nonbondedDeviceId = -1;
+
if (thisRankHasDuty(cr, DUTY_PP))
{
- if (!gpuTaskAssignment.empty())
+ // This works because only one task of each type is currently permitted.
+ auto nbGpuTaskMapping = std::find_if(gpuTaskAssignment.begin(), gpuTaskAssignment.end(),
+ hasTaskType<GpuTask::Nonbonded>);
+ if (nbGpuTaskMapping != gpuTaskAssignment.end())
{
- GMX_RELEASE_ASSERT(gpuTaskAssignment.size() == 1, "A valid GPU assignment can only have one task per rank");
- GMX_RELEASE_ASSERT(gpuTaskAssignment[0].task_ == gmx::GpuTask::Nonbonded, "A valid GPU assignment can only include short-ranged tasks");
- nonbondedDeviceId = gpuTaskAssignment[0].deviceId_;
+ int nonbondedDeviceId = nbGpuTaskMapping->deviceId_;
nonbondedDeviceInfo = getDeviceInfo(hwinfo->gpu_info, nonbondedDeviceId);
- }
- }
+ init_gpu(mdlog, nonbondedDeviceInfo);
- if (DOMAINDECOMP(cr))
- {
- /* When we share GPUs over ranks, we need to know this for the DLB */
- dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+ if (DOMAINDECOMP(cr))
+ {
+ /* When we share GPUs over ranks, we need to know this for the DLB */
+ dd_setup_dlb_resource_sharing(cr, nonbondedDeviceId);
+ }
+
+ }
}
/* getting number of PP/PME threads
pmedata = nullptr;
}
- /* Free GPU memory and context */
- free_gpu_resources(fr, cr, nonbondedDeviceInfo);
+ // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
+ // before we destroy the GPU context(s) in free_gpu_resources().
+ // Pinned buffers are associated with contexts in CUDA.
+ // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
+ mdAtoms.reset(nullptr);
+ globalState.reset(nullptr);
+
+ /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
+ free_gpu_resources(fr, cr);
+ free_gpu(nonbondedDeviceInfo);
if (doMembed)
{