if (!isNbGpuDone)
{
GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
- wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
flags,
Nbnxm::AtomLocality::Local,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
- forceWithShiftForces.shiftForces(), completionType);
- wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
- // To get the call count right, when the task finished we
- // issue a start/stop.
- // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
- // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
+ forceWithShiftForces.shiftForces(), completionType, wcycle);
+
if (isNbGpuDone)
{
- wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
-
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
forceWithShiftForces.force());
}
{
if (bUseGPU)
{
- wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
- Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
- flags, Nbnxm::AtomLocality::NonLocal,
- enerd->grpp.ener[egLJSR].data(),
- enerd->grpp.ener[egCOULSR].data(),
- forceWithShiftForces.shiftForces());
- cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+ cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, Nbnxm::AtomLocality::NonLocal,
+ enerd->grpp.ener[egLJSR].data(),
+ enerd->grpp.ener[egCOULSR].data(),
+ forceWithShiftForces.shiftForces(),
+ wcycle);
}
else
{
* of the step time.
*/
const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
-
- wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
- flags, Nbnxm::AtomLocality::Local,
- enerd->grpp.ener[egLJSR].data(),
- enerd->grpp.ener[egCOULSR].data(),
- forceOut.forceWithShiftForces().shiftForces());
- float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ const float waitCycles =
+ Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, Nbnxm::AtomLocality::Local,
+ enerd->grpp.ener[egLJSR].data(),
+ enerd->grpp.ener[egCOULSR].data(),
+ forceOut.forceWithShiftForces().shiftForces(),
+ wcycle);
if (ddBalanceRegionHandler.useBalancingRegion())
{
DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
- if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
+ if (bDoForces && waitCycles <= gpuWaitApiOverheadMargin)
{
/* We measured few cycles, it could be that the kernel
* and transfer finished earlier and there was no actual
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/stringutil.h"
real *e_lj,
real *e_el,
gmx::ArrayRef<gmx::RVec> shiftForces,
- GpuTaskCompletion completionKind)
+ GpuTaskCompletion completionKind,
+ gmx_wallcycle *wcycle)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
// Query the state of the GPU stream and return early if we're not done
if (completionKind == GpuTaskCompletion::Check)
{
+ // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
+ // we start without counting and only when the task finished we issue a
+ // start/stop to increment.
+ // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
+ wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+
if (!haveStreamTasksCompleted(nb->stream[iLocality]))
{
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
// Early return to skip the steps below that we have to do only
// after the NB task completed
return false;
}
+
+ wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
}
else
{
* \param[out] e_lj Pointer to the LJ energy output to accumulate into
* \param[out] e_el Pointer to the electrostatics energy output to accumulate into
* \param[out] shiftForces Shift forces buffer to accumulate into
+ * \param[out] wcycle Pointer to wallcycle data structure
+ * \return The number of cycles the gpu wait took
*/
//NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
-void gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
- int flags,
- AtomLocality aloc,
- real *e_lj,
- real *e_el,
- gmx::ArrayRef<gmx::RVec> shiftForces)
+float gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
+ int flags,
+ AtomLocality aloc,
+ real *e_lj,
+ real *e_el,
+ gmx::ArrayRef<gmx::RVec> shiftForces,
+ gmx_wallcycle *wcycle)
{
+ auto cycleCounter =
+ (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local) ? ewcWAIT_GPU_NB_L : ewcWAIT_GPU_NB_NL;
+
+ wallcycle_start(wcycle, cycleCounter);
gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, shiftForces,
- GpuTaskCompletion::Wait);
+ GpuTaskCompletion::Wait, wcycle);
+ float waitTime = wallcycle_stop(wcycle, cycleCounter);
+
+ return waitTime;
}
} // namespace Nbnxm
#include "locality.h"
struct nbnxn_atomdata_t;
+struct gmx_wallcycle;
enum class GpuTaskCompletion;
namespace gmx
* - auxiliary tasks: updating the internal module state (timing accumulation, list pruning states) and
* - internal staging reduction of (\p fshift, \p e_el, \p e_lj).
*
+ * In GpuTaskCompletion::Check mode this function does the timing and keeps correct count
+ * for the nonbonded task (incrementing only once per taks), in the GpuTaskCompletion::Wait mode
+ * timing is expected to be done in the caller.
+ *
* TODO: improve the handling of outputs e.g. by ensuring that this function explcitly returns the
* force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning
* the energy and Fshift contributions for some external/centralized reduction.
* \param[out] e_el Pointer to the electrostatics energy output to accumulate into
* \param[out] shiftForces Shift forces buffer to accumulate into
* \param[in] completionKind Indicates whether nnbonded task completion should only be checked rather than waited for
+ * \param[out] wcycle Pointer to wallcycle data structure
* \returns True if the nonbonded tasks associated with \p aloc locality have completed
*/
GPU_FUNC_QUALIFIER
-bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags,
- AtomLocality gmx_unused aloc,
- real gmx_unused *e_lj,
- real gmx_unused *e_el,
+bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused flags,
+ AtomLocality gmx_unused aloc,
+ real gmx_unused *e_lj,
+ real gmx_unused *e_el,
gmx::ArrayRef<gmx::RVec> gmx_unused shiftForces,
- GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false);
+ GpuTaskCompletion gmx_unused completionKind,
+ gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(false);
/*! \brief Completes the nonbonded GPU task blocking until GPU tasks and data
* transfers to finish.
* \param[out] shiftForces Shift forces buffer to accumulate into
*/
GPU_FUNC_QUALIFIER
-void gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags,
- AtomLocality gmx_unused aloc,
- real gmx_unused *e_lj,
- real gmx_unused *e_el,
- gmx::ArrayRef<gmx::RVec> gmx_unused shiftForces) GPU_FUNC_TERM;
+float gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused flags,
+ AtomLocality gmx_unused aloc,
+ real gmx_unused *e_lj,
+ real gmx_unused *e_el,
+ gmx::ArrayRef<gmx::RVec> gmx_unused shiftForces,
+ gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0);
/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
GPU_FUNC_QUALIFIER
}
}
+void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
+{
+ if (wc == nullptr)
+ {
+ return;
+ }
+ wc->wcc[ewc].n++;
+}
+
void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
{
if (wc == nullptr)
double wallcycle_stop(gmx_wallcycle_t wc, int ewc);
/* Stop the cycle count for ewc, returns the last cycle count */
+void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc);
+/* Only increment call count for ewc by one */
+
void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c);
/* Returns the cumulative count and cycle count for ewc */