#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxassert.h"
#include "gpubonded_impl.h"
GMX_ASSERT(haveInteractions_,
"Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED);
+
int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
if (fTypeRangeEnd < 0)
launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr,
"exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork)
/* initialize the GPU nbnxm atom data and bonded data structures */
if (simulationWork.useGpuNonbonded)
{
+ // Note: cycle counting only nonbondeds, gpuBonded counts internally
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
if (fr->gpuBonded)
{
nbv->getGridIndices(), top->idef, Nbnxm::gpu_get_xq(nbv->gpu_nbv),
Nbnxm::gpu_get_f(nbv->gpu_nbv), Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
}
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
// Need to run after the GPU-offload bonded interaction lists
ddBalanceRegionHandler.openBeforeForceComputationGpu();
wallcycle_start(wcycle, ewcLAUNCH_GPU);
-
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
}
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
// with X buffer ops offloaded to the GPU on all but the search steps
// bonded work not split into separate local and non-local, so with DD
// we can only launch the kernel after non-local coordinates have been received.
if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
{
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
}
/* launch local nonbonded work on GPU */
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (simulationWork.useGpuNonbonded)
{
- wallcycle_start(wcycle, ewcLAUNCH_GPU);
if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
{
+ wallcycle_start(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
if (domainWork.haveGpuBondedWork)
{
- wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
}
/* launch non-local nonbonded tasks on GPU */
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step,
nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
}
// TODO Force flags should include haveFreeEnergyWork for this domain
if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
{
+ wallcycle_stop(wcycle, ewcFORCE);
/* Wait for non-local coordinate data to be copied from device */
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
+ wallcycle_start_nocount(wcycle, ewcFORCE);
}
// Compute wall interactions, when present.
static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
-/* DEBUG_WCYCLE adds consistency checking for the counters.
- * It checks if you stop a counter different from the last
+#ifndef DEBUG_WCYCLE
+/*! \brief Enables consistency checking for the counters.
+ *
+ * If the macro is set to 1, code checks if you stop a counter different from the last
* one that was opened and if you do nest too deep.
*/
-/* #define DEBUG_WCYCLE */
-
-#ifdef DEBUG_WCYCLE
+# define DEBUG_WCYCLE 0
+#endif
+//! Whether wallcycle debugging is enabled
+constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
+//! True if only the master rank should print debugging output
+constexpr bool gmx_unused onlyMasterDebugPrints = true;
+//! True if cycle counter nesting depth debugging prints are enabled
+constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
+
+#if DEBUG_WCYCLE
# include "gromacs/utility/fatalerror.h"
#endif
gmx_bool wc_barrier;
wallcc_t* wcc_all;
int wc_depth;
-#ifdef DEBUG_WCYCLE
+#if DEBUG_WCYCLE
# define DEPTH_MAX 6
- int counterlist[DEPTH_MAX];
- int count_depth;
+ int counterlist[DEPTH_MAX];
+ int count_depth;
+ bool isMasterRank;
#endif
int ewc_prev;
gmx_cycles_t cycle_prev;
wc->ewc_prev = -1;
wc->reset_counters = resetstep;
+
#if GMX_MPI
if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
{
snew(wc->wcsc, ewcsNR);
}
-#ifdef DEBUG_WCYCLE
- wc->count_depth = 0;
+#if DEBUG_WCYCLE
+ wc->count_depth = 0;
+ wc->isMasterRank = MASTER(cr);
#endif
return wc;
}
-#ifdef DEBUG_WCYCLE
+#if DEBUG_WCYCLE
/*! \brief Records the start of counter \p ewc on the debug nesting stack.
 *
 * Calls gmx_fatal() when wc->count_depth is negative or has already reached
 * DEPTH_MAX. Otherwise stores \p ewc in wc->counterlist at the current depth
 * and increments wc->count_depth. When debugPrintDepth is true and the rank
 * filter passes, a "wcycle_start" line is written to stderr.
 *
 * \param[in,out] wc   Wallcycle data whose counterlist and count_depth are updated.
 * \param[in]     ewc  Index of the counter being started; also used to index wcn for printing.
 */
static void debug_start_check(gmx_wallcycle_t wc, int ewc)
{
- /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
-
// Range check runs before the write below so counterlist is never indexed out of bounds.
if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
{
- gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
// NOTE(review): the reported value is count_depth + 1, presumably the depth this start
// would have reached - confirm this is also the intended output for a negative depth.
+ gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
}
wc->counterlist[wc->count_depth] = ewc;
wc->count_depth++;
+
// Printing happens after the increment, so both the indent width and the printed
// depth use the new (post-start) depth.
+ if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
+ {
// Four spaces of indentation per nesting level.
+ std::string indentStr(4 * wc->count_depth, ' ');
+ fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
+ }
}
static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
{
- wc->count_depth--;
+ if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
+ {
+ std::string indentStr(4 * wc->count_depth, ' ');
+ fprintf(stderr, "%swcycle_stop depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
+ }
- /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
+ wc->count_depth--;
if (wc->count_depth < 0)
{
}
#endif
-#ifdef DEBUG_WCYCLE
+#if DEBUG_WCYCLE
debug_start_check(wc, ewc);
#endif
}
#endif
-#ifdef DEBUG_WCYCLE
+#if DEBUG_WCYCLE
debug_stop_check(wc, ewc);
#endif