GMX_ASSERT(stat == cudaSuccess,
("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str());
}
+ //! Reset the event (not needed in CUDA)
+ inline void reset() {}
private:
cudaEvent_t event_;
+ ocl_get_error_string(clError)));
}
- releaseEvent();
+ reset();
}
/*! \brief Checks the completion of the underlying event and resets the object if it was. */
inline bool isReady()
bool hasTriggered = (result == CL_COMPLETE);
if (hasTriggered)
{
- releaseEvent();
+ reset();
}
return hasTriggered;
}
+ ocl_get_error_string(clError)));
}
- releaseEvent();
+ reset();
}
-private:
- inline void releaseEvent()
+ //! Reset (release) the event to unmarked state.
+ inline void reset()
{
cl_int clError = clReleaseEvent(event_);
if (CL_SUCCESS != clError)
event_ = nullptr;
}
+private:
cl_event event_;
};
deviceStream.stream().submit_barrier(waitlist);
event_.reset();
}
+ //! Reset the event to unmarked state.
+ inline void reset() { event_.reset(); }
private:
std::optional<cl::sycl::event> event_ = std::nullopt;
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
return shmem;
}
-/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
- *
- * As the point where the local stream tasks can be considered complete happens
- * at the same call point where the nonlocal stream should be synced with the
- * the local, this function records the event if called with the local stream as
- * argument and inserts in the GPU stream a wait on the event on the nonlocal.
- */
-void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality)
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
{
if (interactionLocality == InteractionLocality::Local)
{
- cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, deviceStream.stream());
- CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed");
+ nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
}
else
{
- cudaError_t stat =
- cudaStreamWaitEvent(deviceStream.stream(), nb->misc_ops_and_local_H2D_done, 0);
- CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed");
+ nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
}
}
}
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal,
- "Only local and non-local xq transfers are supported");
-
const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
{
plist->haveFreshList = false;
+ // The event is marked for Local interactions unconditionally,
+ // so it has to be released here because of the early return
+ // for NonLocal interactions.
+ nb->misc_ops_and_local_H2D_done.reset();
+
return;
}
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- cudaError_t stat;
- int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
+ int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
/* determine interaction locality from atom locality */
const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+ GMX_ASSERT(iloc == InteractionLocality::Local
+ || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+ "Non-local stream is indicating that the copy back event is enqueued at the "
+ "beginning of the copy back function.");
/* extract the data */
cu_atomdata_t* adat = nb->atdat;
/* don't launch non-local copy-back if there was no non-local work to do */
if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
{
+ nb->bNonLocalStreamDoneMarked = false;
return;
}
/* With DD the local D2H transfer can only start after the non-local
kernel has finished. */
- if (iloc == InteractionLocality::Local && nb->bUseTwoStreams)
+ if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
{
- stat = cudaStreamWaitEvent(deviceStream.stream(), nb->nonlocal_done, 0);
- CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
+ nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+ nb->bNonLocalStreamDoneMarked = false;
}
/* DtoH f
back first. */
if (iloc == InteractionLocality::NonLocal)
{
- stat = cudaEventRecord(nb->nonlocal_done, deviceStream.stream());
- CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed");
+ nb->nonlocal_done.markEvent(deviceStream);
+ nb->bNonLocalStreamDoneMarked = true;
}
/* only transfer energies in the local stream */
launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
}
- // TODO: note that this is not necessary when there astreamre no local atoms, that is:
+ // TODO: note that this is not necessary when there are no local atoms, that is:
// (numAtoms == 0 && interactionLoc == InteractionLocality::Local)
// but for now we avoid that optimization
nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
const nbnxn_atomdata_t* nbat,
bool bLocalAndNonlocal)
{
- cudaError_t stat;
-
auto nb = new NbnxmGpu();
nb->deviceContext_ = &deviceStreamManager.context();
snew(nb->atdat, 1);
;
}
- /* init events for sychronization (timing disabled for performance reasons!) */
- stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming);
- CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed");
- stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming);
- CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed");
-
- nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
-
/* WARNING: CUDA timings are incorrect with multiple streams.
* This is the main reason why they are disabled by default.
*/
void gpu_free(NbnxmGpu* nb)
{
- cudaError_t stat;
cu_atomdata_t* atdat;
NBParamGpu* nbparam;
destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
}
- stat = cudaEventDestroy(nb->nonlocal_done);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done");
- stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done);
- CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
-
delete nb->timers;
if (!useLjCombRule(nb->nbparam->vdwType))
}
}
- // The above data is transferred on the local stream but is a
- // dependency of the nonlocal stream (specifically the nonlocal X
- // buf ops kernel). We therefore set a dependency to ensure
- // that the nonlocal stream waits on the local stream here.
- // This call records an event in the local stream:
- nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
- // ...and this call instructs the nonlocal stream to wait on that event:
- nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+ if (gpu_nbv->bUseTwoStreams)
+ {
+ // The above data is transferred on the local stream but is a
+ // dependency of the nonlocal stream (specifically the nonlocal X
+ // buf ops kernel). We therefore set a dependency to ensure
+ // that the nonlocal stream waits on the local stream here.
+ // This call records an event in the local stream:
+ gpu_nbv->misc_ops_and_local_H2D_done.markEvent(
+ *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::Local]);
+ // ...and this call instructs the nonlocal stream to wait on that event:
+ gpu_nbv->misc_ops_and_local_H2D_done.enqueueWaitEvent(
+ *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::NonLocal]);
+ }
return;
}
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2012, The GROMACS development team.
- * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2013-2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/gpu_utils/cudautils.cuh"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/nbnxm/gpu_types_common.h"
*/
typedef struct Nbnxm::gpu_timers_t cu_timers_t;
-class GpuEventSynchronizer;
-
/*! \internal
* \brief Main data structure for CUDA nonbonded force calculations.
*/
const DeviceContext* deviceContext_;
/*! \brief true if doing both local/non-local NB work on GPU */
bool bUseTwoStreams = false;
+ //! true indicates that the nonlocal_done event was marked
+ bool bNonLocalStreamDoneMarked = false;
+
/*! \brief atom data */
cu_atomdata_t* atdat = nullptr;
/*! \brief array of atom indices */
/*! \brief local and non-local GPU streams */
gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
- /*! \brief Events used for synchronization */
- /*! \{ */
/*! \brief Event triggered when the non-local non-bonded
* kernel is done (and the local transfer can proceed) */
- cudaEvent_t nonlocal_done = nullptr;
+ GpuEventSynchronizer nonlocal_done;
/*! \brief Event triggered when the tasks issued in the local
* stream that need to precede the non-local force or buffer
* operation calculations are done (e.g. f buffer 0-ing, local
* x/q H2D, buffer op initialization in local stream that is
* required also by nonlocal stream ) */
- cudaEvent_t misc_ops_and_local_H2D_done = nullptr;
- /*! \} */
+ GpuEventSynchronizer misc_ops_and_local_H2D_done;
/*! \brief True if there is work for the current domain in the
* respective locality.
* will be true. */
gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork = { { false } };
- /*! \brief Event triggered when non-local coordinate buffer
- * has been copied from device to host. */
- GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr;
-
/* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple
* concurrent streams, so we won't time if both l/nl work is done on GPUs.
* Timer init/uninit is still done even with timing off so only the condition
Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv);
}
-void nonbonded_verlet_t::insertNonlocalGpuDependency(const gmx::InteractionLocality interactionLocality) const
-{
- Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
-}
-
/*! \endcond */
//! Init for GPU version of setup coordinates in Nbnxm
void atomdata_init_copy_x_to_nbat_x_gpu() const;
- //! Sync the nonlocal GPU stream with dependent tasks in the local queue.
- void insertNonlocalGpuDependency(gmx::InteractionLocality interactionLocality) const;
-
//! Returns a reference to the pairlist sets
const PairlistSets& pairlistSets() const { return *pairlistSets_; }
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
int gmx_unused numColumnsMax) CUDA_FUNC_TERM;
/*! \brief Sync the nonlocal stream with dependent tasks in the local queue.
+ *
+ * As the point where the local stream tasks can be considered complete happens
+ * at the same call point where the nonlocal stream should be synced with
+ * the local, this function records the event if called with the local stream as
+ * argument and inserts in the GPU stream a wait on the event on the nonlocal.
+ *
* \param[in] nb The nonbonded data GPU structure
* \param[in] interactionLocality Local or NonLocal sync point
*/
-CUDA_FUNC_QUALIFIER
-void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu gmx_unused* nb,
- gmx::InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM;
+GPU_FUNC_QUALIFIER
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu gmx_unused* nb,
+ gmx::InteractionLocality gmx_unused interactionLocality) GPU_FUNC_TERM;
/*! \brief Set up internal flags that indicate what type of short-range work there is.
*
nbparams_params->vdw_switch = nbp->vdw_switch;
}
-/*! \brief Enqueues a wait for event completion.
- *
- * Then it releases the event and sets it to 0.
- * Don't use this function when more than one wait will be issued for the event.
- * Equivalent to Cuda Stream Sync. */
-static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
{
- cl_int gmx_unused cl_error;
+ const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
- /* Enqueue wait */
- cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
- GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
+ /* When we get here all misc operations issued in the local stream as well as
+ the local xq H2D are done,
+ so we record that in the local stream and wait for it in the nonlocal one.
+ This wait needs to precede any PP tasks, bonded or nonbonded, that may
+ compute on interactions between local and nonlocal atoms.
+ */
+ if (nb->bUseTwoStreams)
+ {
+ if (interactionLocality == InteractionLocality::Local)
+ {
+ nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
- /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
- cl_error = clReleaseEvent(*ocl_event);
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
- *ocl_event = nullptr;
+ /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
+ * in the local stream in order to be able to sync with the above event
+ * from the non-local stream.
+ */
+ cl_int gmx_used_in_debug cl_error = clFlush(deviceStream.stream());
+ GMX_ASSERT(cl_error == CL_SUCCESS,
+ ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
+ }
+ else
+ {
+ nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
+ }
+ }
}
/*! \brief Launch asynchronously the xq buffer host to device copy. */
{
plist->haveFreshList = false;
+ // The event is marked for Local interactions unconditionally,
+ // so it has to be released here because of the early return
+ // for NonLocal interactions.
+ nb->misc_ops_and_local_H2D_done.reset();
+
return;
}
t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
}
- /* When we get here all misc operations issues in the local stream as well as
+ /* When we get here all misc operations issued in the local stream as well as
the local xq H2D are done,
- so we record that in the local stream and wait for it in the nonlocal one. */
- if (nb->bUseTwoStreams)
- {
- if (iloc == InteractionLocality::Local)
- {
- cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
- deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-
- /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
- * in the local stream in order to be able to sync with the above event
- * from the non-local stream.
- */
- cl_error = clFlush(deviceStream.stream());
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
- }
- else
- {
- sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
- }
- }
+ so we record that in the local stream and wait for it in the nonlocal one.
+ This wait needs to precede any PP tasks, bonded or nonbonded, that may
+ compute on interactions between local and nonlocal atoms.
+ */
+ nbnxnInsertNonlocalGpuDependency(nb, iloc);
}
/* determine interaction locality from atom locality */
const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
+ GMX_ASSERT(iloc == InteractionLocality::Local
+ || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+ "Non-local stream is indicating that the copy back event is enqueued at the "
+ "beginning of the copy back function.");
cl_atomdata_t* adat = nb->atdat;
cl_timers_t* t = nb->timers;
{
/* TODO An alternative way to signal that non-local work is
complete is to use a clEnqueueMarker+clEnqueueBarrier
- pair. However, the use of bNonLocalStreamActive has the
+ pair. However, the use of bNonLocalStreamDoneMarked has the
advantage of being local to the host, so probably minimizes
overhead. Curiously, for NVIDIA OpenCL with an empty-domain
test case, overall simulation performance was higher with
the API calls, but this has not been tested on AMD OpenCL,
so could be worth considering in future. */
- nb->bNonLocalStreamActive = CL_FALSE;
+ nb->bNonLocalStreamDoneMarked = false;
return;
}
/* With DD the local D2H transfer can only start after the non-local
has been launched. */
- if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
+ if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
{
- sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
+ nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+ nb->bNonLocalStreamDoneMarked = false;
}
/* DtoH f */
data back first. */
if (iloc == InteractionLocality::NonLocal)
{
- cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
- GMX_ASSERT(cl_error == CL_SUCCESS,
- ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
- nb->bNonLocalStreamActive = CL_TRUE;
+ nb->nonlocal_done.markEvent(deviceStream);
+ nb->bNonLocalStreamDoneMarked = true;
}
/* only transfer energies in the local stream */
pfree(nb->nbst.fshift);
nb->nbst.fshift = nullptr;
- /* Free other events */
- if (nb->nonlocal_done)
- {
- clReleaseEvent(nb->nonlocal_done);
- nb->nonlocal_done = nullptr;
- }
- if (nb->misc_ops_and_local_H2D_done)
- {
- clReleaseEvent(nb->misc_ops_and_local_H2D_done);
- nb->misc_ops_and_local_H2D_done = nullptr;
- }
-
freeGpuProgram(nb->dev_rundata->program);
delete nb->dev_rundata;
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gmxopencl.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
#include "gromacs/gpu_utils/gputraits_ocl.h"
#include "gromacs/gpu_utils/oclutils.h"
#include "gromacs/mdtypes/interaction_const.h"
//! true if doing both local/non-local NB work on GPU
bool bUseTwoStreams = false;
- //! true indicates that the nonlocal_done event was enqueued
- bool bNonLocalStreamActive = false;
+ //! true indicates that the nonlocal_done event was marked
+ bool bNonLocalStreamDoneMarked = false;
//! atom data
cl_atomdata_t* atdat = nullptr;
/*! \{ */
/*! \brief Event triggered when the non-local non-bonded
* kernel is done (and the local transfer can proceed) */
- cl_event nonlocal_done = nullptr;
+ GpuEventSynchronizer nonlocal_done;
/*! \brief Event triggered when the tasks issued in the local
* stream that need to precede the non-local force or buffer
* operation calculations are done (e.g. f buffer 0-ing, local
* x/q H2D, buffer op initialization in local stream that is
* required also by nonlocal stream ) */
- cl_event misc_ops_and_local_H2D_done = nullptr;
+ GpuEventSynchronizer misc_ops_and_local_H2D_done;
/*! \} */
//! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled