over-ride the number of DD pulses used
(default 0, meaning no over-ride). Normally 1 or 2.
+``GMX_DISABLE_ALTERNATING_GPU_WAIT``
+ disables the specialized polling wait path used for PME and nonbonded GPU task
+ completion, which overlaps the reduction of whichever forces arrive first with
+ the wait for the other task. Setting this variable switches to the generic path
+ with a fixed waiting order.
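+ For example, running ``GMX_DISABLE_ALTERNATING_GPU_WAIT=1 gmx mdrun`` falls
+ back to the fixed waiting order.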
+
There are a number of extra environment variables like these
that are used in debugging - check the code!
*
* \param[in] pmeGpu The PME GPU structure.
*/
-static void pme_gpu_reinit_computation(const PmeGpu *pmeGpu)
+void pme_gpu_reinit_computation(const PmeGpu *pmeGpu)
{
pme_gpu_clear_grids(pmeGpu);
pme_gpu_clear_energy_virial(pmeGpu);
}
-void pme_gpu_finish_computation(const PmeGpu *pmeGpu)
-{
- // Synchronize the whole PME stream at once, including D2H result transfers.
- pme_gpu_synchronize(pmeGpu);
-
- pme_gpu_update_timings(pmeGpu);
- pme_gpu_reinit_computation(pmeGpu);
-}
-
/*! \brief \libinternal
* (Re-)initializes all the PME GPU data related to the grid size and cut-off.
*
*/
CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM
+/*! \libinternal \brief
+ * Checks whether work in the PME GPU stream has completed.
+ *
+ * \param[in] pmeGpu The PME GPU structure.
+ *
+ * \returns True if work in the PME stream has completed.
+ */
+CUDA_FUNC_QUALIFIER bool pme_gpu_stream_query(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGpu)) CUDA_FUNC_TERM_WITH_RETURN(false)
+
/*! \libinternal \brief
* Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
*
/*! \libinternal \brief
* Returns the output virial and energy of the PME solving.
- * Should be called after pme_gpu_finish_computation.
*
* \param[in] pmeGpu The PME GPU structure.
* \param[out] energy The output energy.
const int nAtoms,
const real *charges);
+/*! \brief \libinternal
+ * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
+ *
+ * This clears the device-side working buffers in preparation for new computation.
+ *
+ * \param[in] pmeGpu The PME GPU structure.
+ */
+void pme_gpu_reinit_computation(const PmeGpu *pmeGpu);
+
+
#endif
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
-void
-pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
- gmx_wallcycle_t wcycle,
- gmx::ArrayRef<const gmx::RVec> *forces,
- matrix virial,
- real *energy)
+/*! \brief Reduce staged virial and energy outputs.
+ *
+ * \param[in] pme The PME data structure.
+ * \param[out] forces Output forces pointer; the internal ArrayRef gets assigned to it.
+ * \param[out] virial The output virial matrix.
+ * \param[out] energy The output energy.
+ */
+static void pme_gpu_get_staged_results(const gmx_pme_t *pme,
+ gmx::ArrayRef<const gmx::RVec> *forces,
+ matrix virial,
+ real *energy)
{
- GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
-
const bool haveComputedEnergyAndVirial = pme->gpu->settings.currentFlags & GMX_PME_CALC_ENER_VIR;
-
- wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
- pme_gpu_finish_computation(pme->gpu);
- wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
-
*forces = pme_gpu_get_forces(pme->gpu);
if (haveComputedEnergyAndVirial)
}
}
}
+
+bool pme_gpu_try_finish_task(const gmx_pme_t *pme,
+ gmx_wallcycle_t wcycle,
+ gmx::ArrayRef<const gmx::RVec> *forces,
+ matrix virial,
+ real *energy,
+ GpuTaskCompletion completionKind)
+{
+ GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+ wallcycle_start_nocount(wcycle, ewcWAIT_GPU_PME_GATHER);
+
+ if (completionKind == GpuTaskCompletion::Check)
+ {
+ // Query the PME stream for completion of all tasks enqueued and
+ // if we're not done, stop the timer before early return.
+ if (!pme_gpu_stream_query(pme->gpu))
+ {
+ wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+ return false;
+ }
+ }
+ else
+ {
+ // Synchronize the whole PME stream at once, including D2H result transfers.
+ pme_gpu_synchronize(pme->gpu);
+ }
+ wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+
+ // Time the final staged data handling separately with a counting call to get
+ // the call count right.
+ wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
+
+ // The computation has completed; do timing accounting and reset the buffers
+ pme_gpu_update_timings(pme->gpu);
+ // TODO: move this later and launch it together with the other
+ // non-bonded tasks at the end of the step
+ pme_gpu_reinit_computation(pme->gpu);
+
+ pme_gpu_get_staged_results(pme, forces, virial, energy);
+
+ wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+
+ return true;
+}
+
+void pme_gpu_wait_finish_task(const gmx_pme_t *pme,
+ gmx_wallcycle_t wcycle,
+ gmx::ArrayRef<const gmx::RVec> *forces,
+ matrix virial,
+ real *energy)
+{
+ pme_gpu_try_finish_task(pme, wcycle, forces, virial, energy, GpuTaskCompletion::Wait);
+}
pme_gpu_launch_spread(pme, as_rvec_array(pme_pp->x.data()), wcycle);
pme_gpu_launch_complex_transforms(pme, wcycle);
pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
- pme_gpu_wait_for_gpu(pme, wcycle, &forces, vir_q, &energy_q);
+ pme_gpu_wait_finish_task(pme, wcycle, &forces, vir_q, &energy_q);
}
else
{
pmeGpu->deviceInfo);
}
+bool pme_gpu_stream_query(const PmeGpu *pmeGpu)
+{
+ return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream);
+}
+
void pme_gpu_copy_input_gather_grid(const PmeGpu *pmeGpu, float *h_grid)
{
const size_t gridSize = pmeGpu->archSpecific->realGridSize * sizeof(float);
struct gmx_device_info_t;
struct gmx_pme_t;
+enum class GpuTaskCompletion;
+
namespace gmx
{
class ForceWithVirial;
* \param[out] virial The output virial matrix.
* \param[out] energy The output energy.
*/
-void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
- gmx_wallcycle_t wcycle,
- gmx::ArrayRef<const gmx::RVec> *forces,
- matrix virial,
- real *energy);
+void pme_gpu_wait_finish_task(const gmx_pme_t *pme,
+ gmx_wallcycle_t wcycle,
+ gmx::ArrayRef<const gmx::RVec> *forces,
+ matrix virial,
+ real *energy);
+/*! \brief
+ * Attempts to complete PME GPU tasks.
+ *
+ * The \p completionKind argument controls whether the function blocks until all
+ * enqueued PME GPU tasks have completed (as pme_gpu_wait_finish_task() does) or
+ * only checks for completion and returns immediately if they have not.
+ * When blocking, or when the tasks have completed, it also gets the output forces
+ * by assigning the ArrayRef to the \p forces pointer passed in.
+ * Virial/energy are also outputs if they were to be computed.
+ *
+ * Note: this also launches the reinitialization of the PME output buffers.
+ * TODO: this should be moved out to avoid miscounting its wall-time (as wait instead of launch).
+ *
+ * \param[in] pme The PME data structure.
+ * \param[in] wcycle The wallclock counter.
+ * \param[out] forces The output forces.
+ * \param[out] virial The output virial matrix.
+ * \param[out] energy The output energy.
+ * \param[in] completionKind Indicates whether PME task completion should only be checked rather than waited for
+ * \returns True if the PME GPU tasks have completed
+ */
+bool pme_gpu_try_finish_task(const gmx_pme_t *pme,
+ gmx_wallcycle_t wcycle,
+ gmx::ArrayRef<const gmx::RVec> *forces,
+ matrix virial,
+ real *energy,
+ GpuTaskCompletion completionKind);
+
#endif
rvec_inc(a, tmp);
}
-/*! \brief Calls cudaStreamSynchronize() in the stream \p s.
+/*! \brief Waits for all tasks in stream \p s to complete.
*
* \param[in] s stream to synchronize with
*/
CU_RET_ERR(stat, "cudaStreamSynchronize failed");
}
+/*! \brief Returns true if all tasks in \p s have completed.
+ *
+ * \param[in] s stream to check
+ *
+ * \returns True if all tasks enqueued in the stream \p s (at the time of this call) have completed.
+ */
+static inline bool haveStreamTasksCompleted(cudaStream_t s)
+{
+ cudaError_t stat = cudaStreamQuery(s);
+
+ if (stat == cudaErrorNotReady)
+ {
+ // work is still in progress in the stream
+ return false;
+ }
+
+ GMX_ASSERT(stat != cudaErrorInvalidResourceHandle, "Stream identifier not valid");
+
+ // cudaSuccess and cudaErrorNotReady are the expected return values
+ CU_RET_ERR(stat, "Unexpected cudaStreamQuery failure");
+
+ GMX_ASSERT(stat == cudaSuccess, "Values other than cudaSuccess should have been explicitly handled");
+
+ return true;
+}
+
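+// A minimal polling sketch (illustration only; doIndependentCpuWork() and
+// processResults() are hypothetical placeholders): haveStreamTasksCompleted()
+// lets a caller overlap CPU work with tasks still in flight in the stream.
+//
+//     while (!haveStreamTasksCompleted(stream))
+//     {
+//         doIndependentCpuWork(); // overlapped with the GPU tasks in stream
+//     }
+//     processResults(); // safe: all tasks enqueued before the loop completed
+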
#endif
Async
};
+//! Types of actions associated with waiting or checking the completion of GPU tasks
+enum class GpuTaskCompletion
+{
+ Wait, /*!< Issue a blocking wait for the task */
+ Check /*!< Only check whether the task has completed */
+};
+
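+// A sketch of the check-then-wait idiom this enum enables (illustration only;
+// tryFinishTask(), haveOtherCpuWork() and doSomeCpuWork() are hypothetical
+// stand-ins for functions like pme_gpu_try_finish_task() and caller-side work):
+//
+//     bool done = false;
+//     while (!done)
+//     {
+//         // Block only when there is no more CPU work to overlap with.
+//         GpuTaskCompletion kind = haveOtherCpuWork() ? GpuTaskCompletion::Check
+//                                                     : GpuTaskCompletion::Wait;
+//         done = tryFinishTask(kind);
+//         if (!done)
+//         {
+//             doSomeCpuWork(); // overlapped with the still-running GPU task
+//         }
+//     }
+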
/*! \brief Detect all GPUs in the system.
*
* Will detect every GPU supported by the device driver in use. Also
("Error caught during clFinish:" + ocl_get_error_string(cl_error)).c_str());
}
+/*! \brief Dummy stub for checking completion of tasks enqueued in an OpenCL queue (not implemented).
+ *
+ * \param[in] s queue to check
+ *
+ * \returns Would return true if all tasks enqueued in the queue \p s had completed,
+ * but an OpenCL implementation is missing, so this always asserts.
+ */
+static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s)
+{
+ GMX_RELEASE_ASSERT(false, "haveStreamTasksCompleted is not implemented for OpenCL");
+ return false;
+}
+
#endif
#endif
struct nbnxn_atomdata_t;
+enum class GpuTaskCompletion;
/*! \brief
* Launch asynchronously the nonbonded force calculations.
int gmx_unused flags,
int gmx_unused aloc) GPU_FUNC_TERM
-/*! \brief
- * Wait for the asynchronously launched nonbonded tasks and data
+/*! \brief Attempts to complete the nonbonded GPU task.
+ *
+ * This function attempts to complete the nonbonded task (both GPU and CPU auxiliary work).
+ * Success, i.e. that the tasks completed and the results are ready to be consumed, is signaled
+ * by the return value (always true if a blocking wait was requested).
+ *
+ * The \p completionKind parameter controls whether the behavior is non-blocking
+ * (achieved by passing GpuTaskCompletion::Check) or a blocking wait until the results
+ * are ready (when GpuTaskCompletion::Wait is passed).
+ * In "Check" mode the function returns immediately if the GPU stream
+ * still contains tasks that have not completed, which allows more flexible
+ * overlapping of work on the CPU with GPU execution.
+ *
+ * Note that it is only safe to use the results, and to continue to the next MD
+ * step, when this function has returned true, which indicates successful completion of
+ * - all nonbonded GPU tasks: both compute and device transfer(s),
+ * - auxiliary tasks: updating the internal module state (timing accumulation, list pruning states), and
+ * - internal staging reduction of (\p fshift, \p e_el, \p e_lj).
+ *
+ * TODO: improve the handling of outputs, e.g. by ensuring that this function explicitly returns the
+ * force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning
+ * the energy and Fshift contributions for some external/centralized reduction.
+ *
+ * \param[in] nb The nonbonded data GPU structure
+ * \param[in] flags Force flags
+ * \param[in] aloc Atom locality identifier
+ * \param[out] e_lj Pointer to the LJ energy output to accumulate into
+ * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
+ * \param[out] fshift Pointer to the shift force buffer to accumulate into
+ * \param[in] completionKind Indicates whether nonbonded task completion should only be checked rather than waited for
+ * \returns True if the nonbonded tasks associated with \p aloc locality have completed
+ */
+GPU_FUNC_QUALIFIER
+bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused flags,
+ int gmx_unused aloc,
+ real gmx_unused *e_lj,
+ real gmx_unused *e_el,
+ rvec gmx_unused *fshift,
+ GpuTaskCompletion gmx_unused completionKind) GPU_FUNC_TERM_WITH_RETURN(false)
+
+/*! \brief Completes the nonbonded GPU task, waiting for GPU tasks and data
* transfers to finish.
*
* Also does timing accounting and reduction of the internal staging buffers.
* \param[out] fshift Pointer to the shift force buffer to accumulate into
*/
GPU_FUNC_QUALIFIER
-void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t gmx_unused *nb,
- int gmx_unused flags,
- int gmx_unused aloc,
- real gmx_unused *e_lj,
- real gmx_unused *e_el,
- rvec gmx_unused *fshift) GPU_FUNC_TERM
+void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb,
+ int gmx_unused flags,
+ int gmx_unused aloc,
+ real gmx_unused *e_lj,
+ real gmx_unused *e_el,
+ rvec gmx_unused *fshift) GPU_FUNC_TERM
/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
GPU_FUNC_QUALIFIER
#include "nbnxn_ocl/nbnxn_ocl_types.h"
#endif
+#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/math/vec.h"
#include "gromacs/mdlib/nbnxn_gpu_types.h"
#include "gromacs/pbcutil/ishift.h"
* nonbonded tasks have completed with the exception of the rolling pruning kernels
* that are accounted for during the following step.
*
+ * NOTE: if timing with multiple GPUs (streams) becomes possible, the
+ * counters could end up being inconsistent due to not being incremented
+ * on some of the nodes when this is skipped on empty local domains!
+ *
* \tparam GpuTimers GPU timers type
* \tparam GpuPairlist Pair list type
* \param[out] timings Pointer to the NB GPU timings data
}
}
-// Documented in nbnxn_gpu.h
-void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_gpu_t *nb,
- int flags,
- int aloc,
- real *e_lj,
- real *e_el,
- rvec *fshift)
+bool nbnxn_gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
+ int flags,
+ int aloc,
+ real *e_lj,
+ real *e_el,
+ rvec *fshift,
+ GpuTaskCompletion completionKind)
{
/* determine interaction locality from atom locality */
int iLocality = gpuAtomToInteractionLocality(aloc);
- /* Launch wait/update timers & counters and do reduction into staging buffers
- BUT skip it when during the non-local phase there was actually no work to do.
- This is consistent with nbnxn_gpu_launch_kernel.
-
- NOTE: if timing with multiple GPUs (streams) becomes possible, the
- counters could end up being inconsistent due to not being incremented
- on some of the nodes! */
+ // We skip it when, during the non-local phase, there was actually no work to do.
+ // This is consistent with nbnxn_gpu_launch_kernel.
if (!canSkipWork(nb, iLocality))
{
- gpuStreamSynchronize(nb->stream[iLocality]);
+ // Query the state of the GPU stream and return early if we're not done
+ if (completionKind == GpuTaskCompletion::Check)
+ {
+ if (!haveStreamTasksCompleted(nb->stream[iLocality]))
+ {
+ // Early return to skip the steps below that we have to do only
+ // after the NB task completed
+ return false;
+ }
+ }
+ else
+ {
+ gpuStreamSynchronize(nb->stream[iLocality]);
+ }
bool calcEner = flags & GMX_FORCE_ENERGY;
bool calcFshift = flags & GMX_FORCE_VIRIAL;
/* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
nb->plist[iLocality]->haveFreshList = false;
+
+ return true;
+}
+
+/*! \brief
+ * Wait for the asynchronously launched nonbonded tasks and data
+ * transfers to finish.
+ *
+ * Also does timing accounting and reduction of the internal staging buffers.
+ * As this is called at the end of the step, it also resets the pair list and
+ * pruning flags.
+ *
+ * \param[in] nb The nonbonded data GPU structure
+ * \param[in] flags Force flags
+ * \param[in] aloc Atom locality identifier
+ * \param[out] e_lj Pointer to the LJ energy output to accumulate into
+ * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
+ * \param[out] fshift Pointer to the shift force buffer to accumulate into
+ */
+void nbnxn_gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb,
+ int flags,
+ int aloc,
+ real *e_lj,
+ real *e_el,
+ rvec *fshift)
+{
+ nbnxn_gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, fshift,
+ GpuTaskCompletion::Wait);
}
#endif
#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
#include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed-forces/bonded.h"
#include "gromacs/listed-forces/disre.h"
#include "nbnxn_kernels/nbnxn_kernel_cpu.h"
#include "nbnxn_kernels/nbnxn_kernel_prune.h"
+// TODO: this environment variable allows us to verify before release
+// that on less common architectures the total cost of polling is not larger
+// than that of a blocking wait (so polling does not introduce overhead when
+// the static PME-first ordering would suffice).
+static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
+
+
void print_time(FILE *out,
gmx_walltime_accounting_t walltime_accounting,
gmx_int64_t step,
pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
}
+/*! \brief
+ * Polling wait for either of the PME or nonbonded GPU tasks.
+ *
+ * Instead of a static order in waiting for GPU tasks, this function
+ * polls to check which of the two tasks completes first, and performs the
+ * associated force buffer reduction overlapped with the other task.
+ * By doing that, unlike with a static scheduling order, one of the reductions
+ * can always be overlapped, regardless of the GPU task completion order.
+ *
+ * \param[in] nbv Nonbonded verlet structure
+ * \param[in] pmedata PME module data
+ * \param[in,out] force Force array to reduce task outputs into.
+ * \param[in,out] forceWithVirial Force and virial buffers
+ * \param[in,out] fshift Shift force output vector that results are reduced into
+ * \param[in,out] enerd Energy data structure that results are reduced into
+ * \param[in] flags Force flags
+ * \param[in] wcycle The wallcycle structure
+ */
+static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
+ const gmx_pme_t *pmedata,
+ gmx::PaddedArrayRef<gmx::RVec> *force,
+ ForceWithVirial *forceWithVirial,
+ rvec fshift[],
+ gmx_enerdata_t *enerd,
+ int flags,
+ gmx_wallcycle_t wcycle)
+{
+ bool isPmeGpuDone = false;
+ bool isNbGpuDone = false;
+
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
+
+ while (!isPmeGpuDone || !isNbGpuDone)
+ {
+ if (!isPmeGpuDone)
+ {
+ matrix vir_Q;
+ real Vlr_q;
+
+ GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+ isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
+ vir_Q, &Vlr_q, completionType);
+
+ if (isPmeGpuDone)
+ {
+ pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
+ enerd, vir_Q, Vlr_q);
+ }
+ }
+
+ if (!isNbGpuDone)
+ {
+ GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+ wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+ isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fshift, completionType);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ // To get the call count right, when the task finished we
+ // issue a start/stop.
+ // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
+ // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
+ if (isNbGpuDone)
+ {
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ nbv->nbat, as_rvec_array(force->data()));
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+ }
+ }
+}
+
/*! \brief
* Launch the dynamic rolling pruning GPU task.
*
if (bUseGPU)
{
wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
- nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
- flags, eatNonlocal,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, eatNonlocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
}
else
}
}
- if (useGpuPme)
+ // With both nonbonded and PME offloaded to a GPU on the same rank, we use
+ // an alternating wait/reduction scheme.
+ bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
+ if (alternateGpuWait)
+ {
+ alternatePmeNbGpuWaitReduce(fr->nbv, fr->pmedata, &force, &forceWithVirial, fr->fshift, enerd, flags, wcycle);
+ }
+
+ if (!alternateGpuWait && useGpuPme)
{
gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
matrix vir_Q;
real Vlr_q;
- pme_gpu_wait_for_gpu(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
+ pme_gpu_wait_finish_task(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
}
- /* Wait for local NB forces */
- if (bUseGPU)
+ /* Wait for local GPU NB outputs on the non-alternating wait path */
+ if (!alternateGpuWait && bUseGPU)
{
/* Measured overhead on CUDA and OpenCL with(out) GPU sharing
* is between 0.5 and 1.5 Mcycles. So 2 MCycles is an overestimate,
const float gpuWaitApiOverheadMargin = 2e6f; /* cycles */
wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
- nbnxn_gpu_wait_for_gpu(nbv->gpu_nbv,
- flags, eatLocal,
- enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
- fr->fshift);
+ nbnxn_gpu_wait_finish_task(nbv->gpu_nbv,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fr->fshift);
float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
if (ddCloseBalanceRegion == DdCloseBalanceRegionAfterForceComputation::yes)
// TODO: move here the PME buffer clearing call pme_gpu_reinit_computation()
}
- /* Do the nonbonded GPU (or emulation) force buffer reduction. */
- if (bUseOrEmulGPU)
+ /* Do the nonbonded GPU (or emulation) force buffer reduction
+ * on the non-alternating path. */
+ if (bUseOrEmulGPU && !alternateGpuWait)
{
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);