From: Artem Zhmurov Date: Wed, 17 Feb 2021 10:24:31 +0000 (+0000) Subject: Use GpuEventSynchronizer in NBNXM X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=21c93104cd8b2f6da24b8037e3aad995211a1a04;p=alexxy%2Fgromacs.git Use GpuEventSynchronizer in NBNXM This unifies the event-based synchronization in CUDA and OpenCL by using a wrapper with platform-agnostic interfaces for events. Refs #2608 --- diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh index b098f35498..f40c7d446f 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer.cuh +++ b/src/gromacs/gpu_utils/gpueventsynchronizer.cuh @@ -114,6 +114,8 @@ public: GMX_ASSERT(stat == cudaSuccess, ("cudaStreamWaitEvent failed. " + gmx::getDeviceErrorString(stat)).c_str()); } + //! Reset the event (not needed in CUDA) + inline void reset() {} private: cudaEvent_t event_; diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h index 2b42308e84..0b9905450f 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h +++ b/src/gromacs/gpu_utils/gpueventsynchronizer_ocl.h @@ -106,7 +106,7 @@ public: + ocl_get_error_string(clError))); } - releaseEvent(); + reset(); } /*! \brief Checks the completion of the underlying event and resets the object if it was. */ inline bool isReady() @@ -121,7 +121,7 @@ public: bool hasTriggered = (result == CL_COMPLETE); if (hasTriggered) { - releaseEvent(); + reset(); } return hasTriggered; } @@ -139,11 +139,11 @@ public: + ocl_get_error_string(clError))); } - releaseEvent(); + reset(); } -private: - inline void releaseEvent() + //! Reset (release) the event to unmarked state. + inline void reset() { cl_int clError = clReleaseEvent(event_); if (CL_SUCCESS != clError) @@ -154,6 +154,7 @@ private: event_ = nullptr; } +private: cl_event event_; }; diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h index 6740cc3a3d..cdbed7cc1b 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h +++ b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h @@ -131,6 +131,8 @@ public: deviceStream.stream().submit_barrier(waitlist); event_.reset(); } + //! Reset the event to unmarked state. + inline void reset() { event_.reset(); } private: std::optional event_ = std::nullopt; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index d269ac27bb..aa5652011b 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team. - * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -424,14 +424,7 @@ static inline int calc_shmem_required_nonbonded(const int num_thre return shmem; } -/*! \brief Sync the nonlocal stream with dependent tasks in the local queue. - * - * As the point where the local stream tasks can be considered complete happens - * at the same call point where the nonlocal stream should be synced with the - * the local, this function records the event if called with the local stream as - * argument and inserts in the GPU stream a wait on the event on the nonlocal. - */ -void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality) +void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality) { const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality]; @@ -445,14 +438,11 @@ void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocal { if (interactionLocality == InteractionLocality::Local) { - cudaError_t stat = cudaEventRecord(nb->misc_ops_and_local_H2D_done, deviceStream.stream()); - CU_RET_ERR(stat, "cudaEventRecord on misc_ops_and_local_H2D_done failed"); + nb->misc_ops_and_local_H2D_done.markEvent(deviceStream); } else { - cudaError_t stat = - cudaStreamWaitEvent(deviceStream.stream(), nb->misc_ops_and_local_H2D_done, 0); - CU_RET_ERR(stat, "cudaStreamWaitEvent on misc_ops_and_local_H2D_done failed"); + nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream); } } } @@ -462,9 +452,6 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - GMX_ASSERT(atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal, - "Only local and non-local xq transfers are supported"); - const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ @@ -489,6 +476,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom { plist->haveFreshList = false; + // The event is marked for Local interactions unconditionally, + // so it has to be released here because of the early return + // for NonLocal interactions. + nb->misc_ops_and_local_H2D_done.reset(); + return; } @@ -809,11 +801,14 @@ void gpu_launch_cpyback(NbnxmGpu* nb, { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - cudaError_t stat; - int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ + int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* determine interaction locality from atom locality */ const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); + GMX_ASSERT(iloc == InteractionLocality::Local + || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false), + "Non-local stream is indicating that the copy back event is enqueued at the " + "beginning of the copy back function."); /* extract the data */ cu_atomdata_t* adat = nb->atdat; @@ -824,6 +819,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* don't launch non-local copy-back if there was no non-local work to do */ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) { + nb->bNonLocalStreamDoneMarked = false; return; } @@ -837,10 +833,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* With DD the local D2H transfer can only start after the non-local kernel has finished. */ - if (iloc == InteractionLocality::Local && nb->bUseTwoStreams) + if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked) { - stat = cudaStreamWaitEvent(deviceStream.stream(), nb->nonlocal_done, 0); - CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed"); + nb->nonlocal_done.enqueueWaitEvent(deviceStream); + nb->bNonLocalStreamDoneMarked = false; } /* DtoH f @@ -866,8 +862,8 @@ void gpu_launch_cpyback(NbnxmGpu* nb, back first. */ if (iloc == InteractionLocality::NonLocal) { - stat = cudaEventRecord(nb->nonlocal_done, deviceStream.stream()); - CU_RET_ERR(stat, "cudaEventRecord on nonlocal_done failed"); + nb->nonlocal_done.markEvent(deviceStream); + nb->bNonLocalStreamDoneMarked = true; } /* only transfer energies in the local stream */ @@ -985,7 +981,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs); } - // TODO: note that this is not necessary when there astreamre no local atoms, that is: + // TODO: note that this is not necessary when there are no local atoms, that is: // (numAtoms == 0 && interactionLoc == InteractionLocality::Local) // but for now we avoid that optimization nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index e442fde24f..7d1334144f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -182,8 +182,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, const nbnxn_atomdata_t* nbat, bool bLocalAndNonlocal) { - cudaError_t stat; - auto nb = new NbnxmGpu(); nb->deviceContext_ = &deviceStreamManager.context(); snew(nb->atdat, 1); @@ -227,14 +225,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, ; } - /* init events for sychronization (timing disabled for performance reasons!) */ - stat = cudaEventCreateWithFlags(&nb->nonlocal_done, cudaEventDisableTiming); - CU_RET_ERR(stat, "cudaEventCreate on nonlocal_done failed"); - stat = cudaEventCreateWithFlags(&nb->misc_ops_and_local_H2D_done, cudaEventDisableTiming); - CU_RET_ERR(stat, "cudaEventCreate on misc_ops_and_local_H2D_done failed"); - - nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer(); - /* WARNING: CUDA timings are incorrect with multiple streams. * This is the main reason why they are disabled by default. */ @@ -409,7 +399,6 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) void gpu_free(NbnxmGpu* nb) { - cudaError_t stat; cu_atomdata_t* atdat; NBParamGpu* nbparam; @@ -427,11 +416,6 @@ void gpu_free(NbnxmGpu* nb) destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj); } - stat = cudaEventDestroy(nb->nonlocal_done); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done"); - stat = cudaEventDestroy(nb->misc_ops_and_local_H2D_done); - CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done"); - delete nb->timers; if (!useLjCombRule(nb->nbparam->vdwType)) @@ -610,14 +594,19 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv } } - // The above data is transferred on the local stream but is a - // dependency of the nonlocal stream (specifically the nonlocal X - // buf ops kernel). We therefore set a dependency to ensure - // that the nonlocal stream waits on the local stream here. - // This call records an event in the local stream: - nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local); - // ...and this call instructs the nonlocal stream to wait on that event: - nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal); + if (gpu_nbv->bUseTwoStreams) + { + // The above data is transferred on the local stream but is a + // dependency of the nonlocal stream (specifically the nonlocal X + // buf ops kernel). We therefore set a dependency to ensure + // that the nonlocal stream waits on the local stream here. + // This call records an event in the local stream: + gpu_nbv->misc_ops_and_local_H2D_done.markEvent( + *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::Local]); + // ...and this call instructs the nonlocal stream to wait on that event: + gpu_nbv->misc_ops_and_local_H2D_done.enqueueWaitEvent( + *gpu_nbv->deviceStreams[Nbnxm::InteractionLocality::NonLocal]); + } return; } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 7c92a1abdc..1cd1606de3 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2012, The GROMACS development team. - * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by + * Copyright (c) 2013-2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -50,6 +50,7 @@ #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/devicebuffer.h" #include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/mdtypes/interaction_const.h" #include "gromacs/nbnxm/gpu_types_common.h" @@ -142,8 +143,6 @@ struct cu_atomdata */ typedef struct Nbnxm::gpu_timers_t cu_timers_t; -class GpuEventSynchronizer; - /*! \internal * \brief Main data structure for CUDA nonbonded force calculations. */ @@ -156,6 +155,9 @@ struct NbnxmGpu const DeviceContext* deviceContext_; /*! \brief true if doing both local/non-local NB work on GPU */ bool bUseTwoStreams = false; + //! true indicates that the nonlocal_done event was marked + bool bNonLocalStreamDoneMarked = false; + /*! \brief atom data */ cu_atomdata_t* atdat = nullptr; /*! \brief array of atom indices */ @@ -185,18 +187,15 @@ struct NbnxmGpu /*! \brief local and non-local GPU streams */ gmx::EnumerationArray deviceStreams; - /*! \brief Events used for synchronization */ - /*! \{ */ /*! \brief Event triggered when the non-local non-bonded * kernel is done (and the local transfer can proceed) */ - cudaEvent_t nonlocal_done = nullptr; + GpuEventSynchronizer nonlocal_done; /*! \brief Event triggered when the tasks issued in the local * stream that need to precede the non-local force or buffer * operation calculations are done (e.g. f buffer 0-ing, local * x/q H2D, buffer op initialization in local stream that is * required also by nonlocal stream ) */ - cudaEvent_t misc_ops_and_local_H2D_done = nullptr; - /*! \} */ + GpuEventSynchronizer misc_ops_and_local_H2D_done; /*! \brief True if there is work for the current domain in the * respective locality. @@ -208,10 +207,6 @@ struct NbnxmGpu * will be true. */ gmx::EnumerationArray haveWork = { { false } }; - /*! \brief Event triggered when non-local coordinate buffer - * has been copied from device to host. */ - GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr; - /* NOTE: With current CUDA versions (<=5.0) timing doesn't work with multiple * concurrent streams, so we won't time if both l/nl work is done on GPUs. * Timer init/uninit is still done even with timing off so only the condition diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 97827a6367..96714bb154 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -249,9 +249,4 @@ void nonbonded_verlet_t::atomdata_init_copy_x_to_nbat_x_gpu() const Nbnxm::nbnxn_gpu_init_x_to_nbat_x(pairSearch_->gridSet(), gpu_nbv); } -void nonbonded_verlet_t::insertNonlocalGpuDependency(const gmx::InteractionLocality interactionLocality) const -{ - Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality); -} - /*! \endcond */ diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index b7c3101799..960e6c2b39 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -345,9 +345,6 @@ public: //! Init for GPU version of setup coordinates in Nbnxm void atomdata_init_copy_x_to_nbat_x_gpu() const; - //! Sync the nonlocal GPU stream with dependent tasks in the local queue. - void insertNonlocalGpuDependency(gmx::InteractionLocality interactionLocality) const; - //! Returns a reference to the pairlist sets const PairlistSets& pairlistSets() const { return *pairlistSets_; } diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 034b0f6562..fe83696934 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2013,2014,2015,2017 by the GROMACS development team. - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -260,12 +260,18 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid, int gmx_unused numColumnsMax) CUDA_FUNC_TERM; /*! \brief Sync the nonlocal stream with dependent tasks in the local queue. + * + * As the point where the local stream tasks can be considered complete happens + * at the same call point where the nonlocal stream should be synced with the + * the local, this function records the event if called with the local stream as + * argument and inserts in the GPU stream a wait on the event on the nonlocal. + * * \param[in] nb The nonbonded data GPU structure * \param[in] interactionLocality Local or NonLocal sync point */ -CUDA_FUNC_QUALIFIER -void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu gmx_unused* nb, - gmx::InteractionLocality gmx_unused interactionLocality) CUDA_FUNC_TERM; +GPU_FUNC_QUALIFIER +void nbnxnInsertNonlocalGpuDependency(NbnxmGpu gmx_unused* nb, + gmx::InteractionLocality gmx_unused interactionLocality) GPU_FUNC_TERM; /*! \brief Set up internal flags that indicate what type of short-range work there is. * diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 165079bce4..16cd265d84 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -489,24 +489,35 @@ static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams nbparams_params->vdw_switch = nbp->vdw_switch; } -/*! \brief Enqueues a wait for event completion. - * - * Then it releases the event and sets it to 0. - * Don't use this function when more than one wait will be issued for the event. - * Equivalent to Cuda Stream Sync. */ -static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event) +void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality) { - cl_int gmx_unused cl_error; + const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality]; - /* Enqueue wait */ - cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr); - GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str()); + /* When we get here all misc operations issued in the local stream as well as + the local xq H2D are done, + so we record that in the local stream and wait for it in the nonlocal one. + This wait needs to precede any PP tasks, bonded or nonbonded, that may + compute on interactions between local and nonlocal atoms. + */ + if (nb->bUseTwoStreams) + { + if (interactionLocality == InteractionLocality::Local) + { + nb->misc_ops_and_local_H2D_done.markEvent(deviceStream); - /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */ - cl_error = clReleaseEvent(*ocl_event); - GMX_ASSERT(cl_error == CL_SUCCESS, - ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str()); - *ocl_event = nullptr; + /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed + * in the local stream in order to be able to sync with the above event + * from the non-local stream. + */ + cl_int gmx_used_in_debug cl_error = clFlush(deviceStream.stream()); + GMX_ASSERT(cl_error == CL_SUCCESS, + ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); + } + else + { + nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream); + } + } } /*! \brief Launch asynchronously the xq buffer host to device copy. */ @@ -539,6 +550,11 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom { plist->haveFreshList = false; + // The event is marked for Local interactions unconditionally, + // so it has to be released here because of the early return + // for NonLocal interactions. + nb->misc_ops_and_local_H2D_done.reset(); + return; } @@ -576,31 +592,13 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream); } - /* When we get here all misc operations issues in the local stream as well as + /* When we get here all misc operations issued in the local stream as well as the local xq H2D are done, - so we record that in the local stream and wait for it in the nonlocal one. */ - if (nb->bUseTwoStreams) - { - if (iloc == InteractionLocality::Local) - { - cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList( - deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done)); - GMX_ASSERT(cl_error == CL_SUCCESS, - ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str()); - - /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed - * in the local stream in order to be able to sync with the above event - * from the non-local stream. - */ - cl_error = clFlush(deviceStream.stream()); - GMX_ASSERT(cl_error == CL_SUCCESS, - ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); - } - else - { - sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done)); - } - } + so we record that in the local stream and wait for it in the nonlocal one. + This wait needs to precede any PP tasks, bonded or nonbonded, that may + compute on interactions between local and nonlocal atoms. + */ + nbnxnInsertNonlocalGpuDependency(nb, iloc); } @@ -944,6 +942,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* determine interaction locality from atom locality */ const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc); + GMX_ASSERT(iloc == InteractionLocality::Local + || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false), + "Non-local stream is indicating that the copy back event is enqueued at the " + "beginning of the copy back function."); cl_atomdata_t* adat = nb->atdat; cl_timers_t* t = nb->timers; @@ -955,13 +957,13 @@ void gpu_launch_cpyback(NbnxmGpu* nb, { /* TODO An alternative way to signal that non-local work is complete is to use a clEnqueueMarker+clEnqueueBarrier - pair. However, the use of bNonLocalStreamActive has the + pair. However, the use of bNonLocalStreamDoneMarked has the advantage of being local to the host, so probably minimizes overhead. Curiously, for NVIDIA OpenCL with an empty-domain test case, overall simulation performance was higher with the API calls, but this has not been tested on AMD OpenCL, so could be worth considering in future. */ - nb->bNonLocalStreamActive = CL_FALSE; + nb->bNonLocalStreamDoneMarked = false; return; } @@ -975,9 +977,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* With DD the local D2H transfer can only start after the non-local has been launched. */ - if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive) + if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked) { - sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done)); + nb->nonlocal_done.enqueueWaitEvent(deviceStream); + nb->bNonLocalStreamDoneMarked = false; } /* DtoH f */ @@ -1001,10 +1004,8 @@ void gpu_launch_cpyback(NbnxmGpu* nb, data back first. */ if (iloc == InteractionLocality::NonLocal) { - cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done)); - GMX_ASSERT(cl_error == CL_SUCCESS, - ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str()); - nb->bNonLocalStreamActive = CL_TRUE; + nb->nonlocal_done.markEvent(deviceStream); + nb->bNonLocalStreamDoneMarked = true; } /* only transfer energies in the local stream */ diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index a70b2b8a71..7472b37c70 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -633,18 +633,6 @@ void gpu_free(NbnxmGpu* nb) pfree(nb->nbst.fshift); nb->nbst.fshift = nullptr; - /* Free other events */ - if (nb->nonlocal_done) - { - clReleaseEvent(nb->nonlocal_done); - nb->nonlocal_done = nullptr; - } - if (nb->misc_ops_and_local_H2D_done) - { - clReleaseEvent(nb->misc_ops_and_local_H2D_done); - nb->misc_ops_and_local_H2D_done = nullptr; - } - freeGpuProgram(nb->dev_rundata->program); delete nb->dev_rundata; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index d1adddb205..2e7098227d 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team. - * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -48,6 +48,7 @@ #include "gromacs/gpu_utils/devicebuffer.h" #include "gromacs/gpu_utils/gmxopencl.h" +#include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h" #include "gromacs/gpu_utils/gputraits_ocl.h" #include "gromacs/gpu_utils/oclutils.h" #include "gromacs/mdtypes/interaction_const.h" @@ -240,8 +241,8 @@ struct NbnxmGpu //! true if doing both local/non-local NB work on GPU bool bUseTwoStreams = false; - //! true indicates that the nonlocal_done event was enqueued - bool bNonLocalStreamActive = false; + //! true indicates that the nonlocal_done event was marked + bool bNonLocalStreamDoneMarked = false; //! atom data cl_atomdata_t* atdat = nullptr; @@ -259,13 +260,13 @@ struct NbnxmGpu /*! \{ */ /*! \brief Event triggered when the non-local non-bonded * kernel is done (and the local transfer can proceed) */ - cl_event nonlocal_done = nullptr; + GpuEventSynchronizer nonlocal_done; /*! \brief Event triggered when the tasks issued in the local * stream that need to precede the non-local force or buffer * operation calculations are done (e.g. f buffer 0-ing, local * x/q H2D, buffer op initialization in local stream that is * required also by nonlocal stream ) */ - cl_event misc_ops_and_local_H2D_done = nullptr; + GpuEventSynchronizer misc_ops_and_local_H2D_done; /*! \} */ //! True if there has been local/nonlocal GPU work, either bonded or nonbonded, scheduled