+/*! \brief
+ * Polling wait for either of the PME or nonbonded GPU tasks.
+ *
+ * Instead of a static order in waiting for GPU tasks, this function
+ * polls to check which of the two tasks completes first, and performs the
+ * associated force buffer reduction overlapped with the other task.
+ * By doing that, unlike static scheduling order, it can always overlap
+ * one of the reductions, regardless of the GPU task completion order.
+ *
+ * \param[in] nbv Nonbonded verlet structure
+ * \param[in] pmedata PME module data
+ * \param[in,out] force Force array to reduce task outputs into.
+ * \param[in,out] forceWithVirial Force and virial buffers
+ * \param[in,out] fshift Shift force output vector results are reduced into
+ * \param[in,out] enerd Energy data structure results are reduced into
+ * \param[in] flags Force flags
+ * \param[in] wcycle The wallcycle structure
+ */
+static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
+ const gmx_pme_t *pmedata,
+ gmx::PaddedArrayRef<gmx::RVec> *force,
+ ForceWithVirial *forceWithVirial,
+ rvec fshift[],
+ gmx_enerdata_t *enerd,
+ int flags,
+ gmx_wallcycle_t wcycle)
+{
+ // Completion flags for the two GPU tasks; the loop below runs until
+ // both tasks have finished and had their outputs reduced.
+ bool isPmeGpuDone = false;
+ bool isNbGpuDone = false;
+
+
+ // Receives the PME GPU task's output forces; filled in by
+ // pme_gpu_try_finish_task() when that task completes.
+ gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
+
+ while (!isPmeGpuDone || !isNbGpuDone)
+ {
+ if (!isPmeGpuDone)
+ {
+ // Virial and energy outputs of the PME task; written by
+ // pme_gpu_try_finish_task() on completion.
+ matrix vir_Q;
+ real Vlr_q;
+
+ // If the nonbonded task has already finished there is nothing
+ // left to overlap with, so block (Wait) instead of polling (Check).
+ GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+ isPmeGpuDone = pme_gpu_try_finish_task(pmedata, wcycle, &pmeGpuForces,
+ vir_Q, &Vlr_q, completionType);
+
+ if (isPmeGpuDone)
+ {
+ // Reduce the PME outputs (forces, virial, energy) into the
+ // caller-provided buffers.
+ pme_gpu_reduce_outputs(wcycle, forceWithVirial, pmeGpuForces,
+ enerd, vir_Q, Vlr_q);
+ }
+ }
+
+ if (!isNbGpuDone)
+ {
+ // Mirror of the logic above: only block once PME is already done.
+ GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
+ wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+ isNbGpuDone = nbnxn_gpu_try_finish_task(nbv->gpu_nbv,
+ flags, eatLocal,
+ enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
+ fshift, completionType);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+ // To get the call count right, when the task finished we
+ // issue a start/stop.
+ // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
+ // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
+ if (isNbGpuDone)
+ {
+ wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
+ wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+ // Reduce the nonbonded forces from the internal nbat layout
+ // into the rvec force buffer for the local atoms.
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ nbnxn_atomdata_add_nbat_f_to_f(nbv->nbs, eatLocal,
+ nbv->nbat, as_rvec_array(force->data()));
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ }
+ }
+ }
+}
+