* possible after subsequently setting a shorter cut-off with change_dd_cutoff.
*/
+GMX_LIBMD_EXPORT
+void dd_setup_dlb_resource_sharing(t_commrec *cr,
+ const gmx_hw_info_t *hwinfo,
+ const gmx_hw_opt_t *hw_opt);
+/* When domains (PP MPI ranks) share a GPU, the individual GPU wait times
+ * are meaningless, as they depend on the order in which tasks on the same
+ * GPU finish. Therefore the wait times need to be averaged over the ranks
+ * sharing the same GPU. This function sets up the communication for that.
+ */
+
GMX_LIBMD_EXPORT
void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd);
t_state *state_local, t_state *state);
enum {
- ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclPME, ddCyclNr
+ ddCyclStep, ddCyclPPduringPME, ddCyclF, ddCyclWaitGPU, ddCyclPME, ddCyclNr
};
GMX_LIBMD_EXPORT
#include "nbnxn_search.h"
#include "bondf.h"
#include "gmx_omp_nthreads.h"
+#include "gpu_utils.h"
#ifdef GMX_LIB_MPI
#include <mpi.h>
/* Stuff for load communication */
gmx_bool bRecordLoad;
gmx_domdec_load_t *load;
+ int nrank_gpu_shared; /* The number of PP ranks sharing this rank's GPU */
#ifdef GMX_MPI
MPI_Comm *mpi_comm_load;
+ MPI_Comm mpi_comm_gpu_shared; /* Communicator over the ranks sharing a GPU */
#endif
/* Maximum DLB scaling per load balancing step in percent */
if (comm->cycl_n[ddCyclF] > 1)
{
/* Subtract the maximum of the last n cycle counts
- * to get rid of possible high counts due to other soures,
+ * to get rid of possible high counts due to other sources,
* for instance system activity, that would otherwise
* affect the dynamic load balancing.
*/
load -= comm->cycl_max[ddCyclF];
}
+
+#ifdef GMX_MPI
+ if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
+ {
+ float gpu_wait, gpu_wait_sum;
+
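+ /* The GPU wait time accumulated over this load-balancing interval */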
+ gpu_wait = comm->cycl[ddCyclWaitGPU];
+ if (comm->cycl_n[ddCyclF] > 1)
+ {
+ /* We should remove the WaitGPU time of the same MD step
+ * as the one with the maximum F time, since the F time
+ * and the wait time are not independent.
+ * Furthermore, the step with the maximum F time should be chosen
+ * identically on all ranks that share the same GPU.
+ * But to keep the code simple, we remove the average instead.
+ * The main reason for artificially long times at some steps
+ * is spurious CPU activity or MPI time, so we don't expect
+ * that changes in the GPU wait time matter a lot here.
+ */
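+ /* Scaling a sum of n samples by (n - 1)/n is the same as
+ * subtracting the average sample, gpu_wait/n, from the sum.
+ */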
+ gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
+ }
+ /* Sum the wait times over the ranks that share the same GPU */
+ MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
+ comm->mpi_comm_gpu_shared);
+ /* Replace the wait time by the average over the ranks */
+ load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
+ }
+#endif
}
return load;
}
#endif
+void dd_setup_dlb_resource_sharing(t_commrec *cr,
+ const gmx_hw_info_t *hwinfo,
+ const gmx_hw_opt_t *hw_opt)
+{
+#ifdef GMX_MPI
+ int physicalnode_id_hash;
+ int gpu_id;
+ gmx_domdec_t *dd;
+ MPI_Comm mpi_comm_pp_physicalnode;
+
+ if (!(cr->duty & DUTY_PP) ||
+ hw_opt->gpu_opt.ncuda_dev_use == 0)
+ {
+ /* Only PP nodes (currently) use GPUs.
+ * If we don't have GPUs, there are no resources to share.
+ */
+ return;
+ }
+
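+ /* A hash that identifies the physical node this rank runs on */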
+ physicalnode_id_hash = gmx_physicalnode_id_hash();
+
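+ /* The id of the GPU device used by this PP rank */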
+ gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid);
+
+ dd = cr->dd;
+
+ if (debug)
+ {
+ fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
+ fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
+ dd->rank, physicalnode_id_hash, gpu_id);
+ }
+ /* Split the PP communicator over the physical nodes */
+ /* TODO: See if we should store this (before), as it's also used
+ * for the nodecomm summation.
+ */
+ MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
+ &mpi_comm_pp_physicalnode);
+ MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
+ &dd->comm->mpi_comm_gpu_shared);
+ MPI_Comm_free(&mpi_comm_pp_physicalnode);
+ MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
+
+ if (debug)
+ {
+ fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
+ }
+
+ /* Note that some ranks could share a GPU, while others don't */
+
+ if (dd->comm->nrank_gpu_shared == 1)
+ {
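+ /* This rank does not share its GPU: free the communicator;
+ * nrank_gpu_shared stays 1, so no wait-time averaging is done.
+ */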
+ MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
+ }
+#endif
+}
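+
+/* Illustrative call site (hypothetical; the actual caller is not part
+ * of this patch): after domain decomposition and hardware detection
+ * are set up, before entering the MD loop:
+ *
+ *     dd_setup_dlb_resource_sharing(cr, hwinfo, hw_opt);
+ */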
+
static void make_load_communicators(gmx_domdec_t *dd)
{
#ifdef GMX_MPI
}
+ /* Initialize the GPU share count to 0; it may be set later */
+ comm->nrank_gpu_shared = 0;
+
comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
comm->bDynLoadBal = (comm->eDLB == edlbYES);
matrix boxs;
rvec vzero, box_diag;
real e, v, dvdl;
- float cycles_pme, cycles_force;
+ float cycles_pme, cycles_force, cycles_wait_gpu;
nonbonded_verlet_t *nbv;
- cycles_force = 0;
- nbv = fr->nbv;
- nb_kernel_type = fr->nbv->grp[0].kernel_type;
+ cycles_force = 0;
+ cycles_wait_gpu = 0;
+ nbv = fr->nbv;
+ nb_kernel_type = fr->nbv->grp[0].kernel_type;
start = mdatoms->start;
homenr = mdatoms->homenr;
{
if (bUseGPU)
{
+ float cycles_tmp;
+
wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
nbnxn_cuda_wait_gpu(nbv->cu_nbv,
nbv->grp[eintNonlocal].nbat,
flags, eatNonlocal,
enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
fr->fshift);
- cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+ cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+ cycles_wait_gpu += cycles_tmp;
+ cycles_force += cycles_tmp;
}
else
{
flags, eatLocal,
enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
fr->fshift);
- wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
/* now clear the GPU outputs while we finish the step on the CPU */
if (wcycle)
{
dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+ if (bUseGPU)
+ {
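+ /* Record the GPU wait time for DLB; it is later averaged
+ * over the ranks sharing the same GPU.
+ */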
+ dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
+ }
}
}