#include "nbnxn_search.h"
#include "bondf.h"
#include "gmx_omp_nthreads.h"
+#include "gpu_utils.h"
#ifdef GMX_LIB_MPI
#include <mpi.h>
/* Stuff for load communication */
gmx_bool bRecordLoad;
gmx_domdec_load_t *load;
+ int nrank_gpu_shared;
#ifdef GMX_MPI
MPI_Comm *mpi_comm_load;
+ MPI_Comm mpi_comm_gpu_shared;
#endif
/* Maximum DLB scaling per load balancing step in percent */
if (comm->cycl_n[ddCyclF] > 1)
{
/* Subtract the maximum of the last n cycle counts
- * to get rid of possible high counts due to other soures,
+ * to get rid of possible high counts due to other sources,
* for instance system activity, that would otherwise
* affect the dynamic load balancing.
*/
load -= comm->cycl_max[ddCyclF];
}
+
+#ifdef GMX_MPI
+ if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
+ {
+ float gpu_wait, gpu_wait_sum;
+
+ gpu_wait = comm->cycl[ddCyclWaitGPU];
+ if (comm->cycl_n[ddCyclF] > 1)
+ {
+ /* We should remove the WaitGPU time of the same MD step
+ * as the one with the maximum F time, since the F time
+ * and the wait time are not independent.
+ * Furthermore, the step for the max F time should be chosen
+ * the same on all ranks that share the same GPU.
+ * But to keep the code simple, we remove the average instead.
+ * The main reason for artificially long times at some steps
+ * is spurious CPU activity or MPI time, so we don't expect
+ * that changes in the GPU wait time matter a lot here.
+ */
+ gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
+ }
+ /* Sum the wait times over the ranks that share the same GPU */
+ MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
+ comm->mpi_comm_gpu_shared);
+ /* Replace the wait time by the average over the ranks */
+ load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
+ }
+#endif
}
return load;
}
#endif
+/* Set up the MPI communicator used by dynamic load balancing to average
+ * GPU wait times over the PP ranks that share the same GPU.
+ *
+ * On success, dd->comm->mpi_comm_gpu_shared holds the sharing communicator
+ * and dd->comm->nrank_gpu_shared its size. When this rank does not share
+ * its GPU with any other rank, the communicator is freed again and only
+ * the (then unused) count of 1 remains. No-op for non-PP ranks and when
+ * no GPUs are in use, and when compiled without GMX_MPI.
+ */
+void dd_setup_dlb_resource_sharing(t_commrec *cr,
+                                   const gmx_hw_info_t *hwinfo,
+                                   const gmx_hw_opt_t *hw_opt)
+{
+#ifdef GMX_MPI
+    int physicalnode_id_hash;
+    int gpu_id;
+    gmx_domdec_t *dd;
+    MPI_Comm mpi_comm_pp_physicalnode;
+
+    if (!(cr->duty & DUTY_PP) ||
+        hw_opt->gpu_opt.ncuda_dev_use == 0)
+    {
+        /* Only PP nodes (currently) use GPUs.
+         * If we don't have GPUs, there are no resources to share.
+         */
+        return;
+    }
+
+    physicalnode_id_hash = gmx_physicalnode_id_hash();
+
+    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->nodeid);
+
+    dd = cr->dd;
+
+    if (debug)
+    {
+        /* Print the actual function name (the old message named a
+         * different/renamed function, which was misleading in logs).
+         */
+        fprintf(debug, "dd_setup_dlb_resource_sharing:\n");
+        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
+                dd->rank, physicalnode_id_hash, gpu_id);
+    }
+    /* Split the PP communicator over the physical nodes */
+    /* TODO: See if we should store this (before), as it's also used for
+     * the nodecomm summation.
+     */
+    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
+                   &mpi_comm_pp_physicalnode);
+    /* Within a physical node, group the ranks by the GPU they use */
+    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
+                   &dd->comm->mpi_comm_gpu_shared);
+    /* The per-physical-node communicator was only needed for the split */
+    MPI_Comm_free(&mpi_comm_pp_physicalnode);
+    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
+
+    if (debug)
+    {
+        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
+    }
+
+    /* Note that some ranks could share a GPU, while others don't */
+
+    if (dd->comm->nrank_gpu_shared == 1)
+    {
+        /* This rank does not share its GPU: release the communicator */
+        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
+    }
+#endif
+}
+
static void make_load_communicators(gmx_domdec_t *dd)
{
#ifdef GMX_MPI
}
+    /* Initialize the GPU share count to 0; it might change later */
+ comm->nrank_gpu_shared = 0;
+
comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
comm->bDynLoadBal = (comm->eDLB == edlbYES);