From d2d4a50b4c636c203028c5bff311924ec15e7825 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 16 Mar 2021 16:26:37 +0300 Subject: [PATCH] Unify gpu_launch_cpyback(...) function in NBNXM Refs #2608 --- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 104 ----------------- src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp | 130 +++++++++++++++++++++ src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 132 ---------------------- src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp | 91 --------------- 4 files changed, 130 insertions(+), 327 deletions(-) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 6e3ad5cdfe..bd5fa8de5d 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -692,110 +692,6 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c } } -void gpu_launch_cpyback(NbnxmGpu* nb, - nbnxn_atomdata_t* nbatom, - const gmx::StepWorkload& stepWork, - const AtomLocality atomLocality) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - /* determine interaction locality from atom locality */ - const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - GMX_ASSERT(iloc == InteractionLocality::Local - || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false), - "Non-local stream is indicating that the copy back event is enqueued at the " - "beginning of the copy back function."); - - /* extract the data */ - NBAtomData* adat = nb->atdat; - Nbnxm::GpuTimers* timers = nb->timers; - bool bDoTime = nb->bDoTime; - const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - - /* don't launch non-local copy-back if there was no non-local work to do */ - if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) - { - nb->bNonLocalStreamDoneMarked = false; - return; - } - - /* local/nonlocal offset and length used for xq and f */ - auto atomsRange = getGpuAtomRange(adat, atomLocality); - - /* beginning of timed D2H 
section */ - if (bDoTime) - { - timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream); - } - - /* With DD the local D2H transfer can only start after the non-local - kernel has finished. */ - if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked) - { - nb->nonlocal_done.enqueueWaitEvent(deviceStream); - nb->bNonLocalStreamDoneMarked = false; - } - - /* DtoH f - * Skip if buffer ops / reduction is offloaded to the GPU. - */ - if (!stepWork.useGpuFBufferOps) - { - static_assert( - sizeof(adat->f[0]) == sizeof(Float3), - "The size of the force buffer element should be equal to the size of float3."); - copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + atomsRange.begin(), - &adat->f, - atomsRange.begin(), - atomsRange.size(), - deviceStream, - GpuApiCallBehavior::Async, - nullptr); - } - - /* After the non-local D2H is launched the nonlocal_done event can be - recorded which signals that the local D2H can proceed. This event is not - placed after the non-local kernel because we want the non-local data - back first. 
*/
-    if (iloc == InteractionLocality::NonLocal)
-    {
-        nb->nonlocal_done.markEvent(deviceStream);
-        nb->bNonLocalStreamDoneMarked = true;
-    }
-
-    /* only transfer energies in the local stream */
-    if (iloc == InteractionLocality::Local)
-    {
-        /* DtoH fshift when virial is needed */
-        if (stepWork.computeVirial)
-        {
-            static_assert(sizeof(nb->nbst.fShift[0]) == sizeof(adat->fShift[0]),
-                          "Sizes of host- and device-side shift vectors should be the same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        }
-
-        /* DtoH energies */
-        if (stepWork.computeEnergy)
-        {
-            static_assert(sizeof(nb->nbst.eLJ[0]) == sizeof(adat->eLJ[0]),
-                          "Sizes of host- and device-side LJ energy terms should be the same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-            static_assert(sizeof(nb->nbst.eElec[0]) == sizeof(adat->eElec[0]),
-                          "Sizes of host- and device-side electrostatic energy terms should be the "
-                          "same.");
-            copyFromDeviceBuffer(
-                    nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        }
-    }
-
-    if (bDoTime)
-    {
-        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
-    }
-}
-
 void cuda_set_cacheconfig()
 {
     cudaError_t stat;
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
index f92f636e19..50519ced6d 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
@@ -64,8 +64,10 @@
 
 #include "gromacs/hardware/device_information.h"
 #include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/nbnxm/gpu_common_utils.h"
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/exceptions.h"
@@ -541,6 +543,144 @@ bool
haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality
     return haveGpuShortRangeWork(*nb, gpuAtomToInteractionLocality(aLocality));
 }
 
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ *
+ * \param[in,out] nb            Nonbonded GPU data; must be non-null. Its non-local "done" flag is updated.
+ * \param[in,out] nbatom        Host atom data; forces are copied into out[0].f unless GPU buffer ops are used.
+ * \param[in]     stepWork      Step workload flags (useGpuFBufferOps, computeVirial, computeEnergy are read).
+ * \param[in]     atomLocality  Atom locality whose range of forces is copied back.
+ */
+void gpu_launch_cpyback(NbnxmGpu*                nb,
+                        struct nbnxn_atomdata_t* nbatom,
+                        const gmx::StepWorkload& stepWork,
+                        const AtomLocality       atomLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    /* determine interaction locality from atom locality */
+    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+    GMX_ASSERT(iloc == InteractionLocality::Local
+                       || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+               "Non-local stream is indicating that the copy back event is enqueued at the "
+               "beginning of the copy back function.");
+
+    /* extract the data */
+    NBAtomData*         adat         = nb->atdat;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
+    bool                bDoTime      = nb->bDoTime;
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+    /* don't launch non-local copy-back if there was no non-local work to do */
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+    {
+        /* TODO An alternative way to signal that non-local work is
+           complete is to use a clEnqueueMarker+clEnqueueBarrier
+           pair. However, the use of bNonLocalStreamDoneMarked has the
+           advantage of being local to the host, so probably minimizes
+           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
+           test case, overall simulation performance was higher with
+           the API calls, but this has not been tested on AMD OpenCL,
+           so could be worth considering in future. */
+        nb->bNonLocalStreamDoneMarked = false;
+        return;
+    }
+
+    /* local/nonlocal offset and length used for xq and f */
+    auto atomsRange = getGpuAtomRange(adat, atomLocality);
+
+    /* beginning of timed D2H section */
+    if (bDoTime)
+    {
+        timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+    }
+
+    /* With DD the local D2H transfer can only start after the non-local
+       D2H has been launched. */
+    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
+    {
+        nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = false;
+    }
+
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
+    {
+        static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
+                      "The host force buffer should be in single precision to match device data size.");
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
+                             &adat->f,
+                             atomsRange.begin(),
+                             atomsRange.size(),
+                             deviceStream,
+                             GpuApiCallBehavior::Async,
+                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+    }
+
+    issueClFlushInStream(deviceStream);
+
+    /* After the non-local D2H is launched the nonlocal_done event can be
+       recorded which signals that the local D2H can proceed. This event is not
+       placed after the non-local kernel because we want the non-local
+       data back first. */
+    if (iloc == InteractionLocality::NonLocal)
+    {
+        nb->nonlocal_done.markEvent(deviceStream);
+        nb->bNonLocalStreamDoneMarked = true;
+    }
+
+    /* only transfer energies in the local stream */
+    if (iloc == InteractionLocality::Local)
+    {
+        /* DtoH fshift when virial is needed */
+        if (stepWork.computeVirial)
+        {
+            static_assert(
+                    sizeof(*nb->nbst.fShift) == sizeof(Float3),
+                    "Sizes of host- and device-side shift vector elements should be the same.");
+            copyFromDeviceBuffer(nb->nbst.fShift,
+                                 &adat->fShift,
+                                 0,
+                                 SHIFTS,
+                                 deviceStream,
+                                 GpuApiCallBehavior::Async,
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+        }
+
+        /* DtoH energies */
+        if (stepWork.computeEnergy)
+        {
+            static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
+                          "Sizes of host- and device-side LJ energy terms should be the same.");
+            copyFromDeviceBuffer(nb->nbst.eLJ,
+                                 &adat->eLJ,
+                                 0,
+                                 1,
+                                 deviceStream,
+                                 GpuApiCallBehavior::Async,
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+            static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
+                          "Sizes of host- and device-side electrostatic energy terms should be the "
+                          "same.");
+            copyFromDeviceBuffer(nb->nbst.eElec,
+                                 &adat->eElec,
+                                 0,
+                                 1,
+                                 deviceStream,
+                                 GpuApiCallBehavior::Async,
+                                 bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+        }
+    }
+
+    if (bDoTime)
+    {
+        timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+    }
+}
+
 void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
 {
     const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index af7f4ad86f..d14aad3da1 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -81,7 +81,6 @@
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/nbnxm_gpu.h"
 #include "gromacs/nbnxm/pairlist.h"
-#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/fatalerror.h"
@@ -812,135 +811,4 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     }
 }
 
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */ -void gpu_launch_cpyback(NbnxmGpu* nb, - struct nbnxn_atomdata_t* nbatom, - const gmx::StepWorkload& stepWork, - const AtomLocality atomLocality) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - cl_int gmx_unused cl_error; - - /* determine interaction locality from atom locality */ - const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - GMX_ASSERT(iloc == InteractionLocality::Local - || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false), - "Non-local stream is indicating that the copy back event is enqueued at the " - "beginning of the copy back function."); - - NBAtomData* adat = nb->atdat; - Nbnxm::GpuTimers* timers = nb->timers; - bool bDoTime = nb->bDoTime; - const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - - /* don't launch non-local copy-back if there was no non-local work to do */ - if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) - { - /* TODO An alternative way to signal that non-local work is - complete is to use a clEnqueueMarker+clEnqueueBarrier - pair. However, the use of bNonLocalStreamDoneMarked has the - advantage of being local to the host, so probably minimizes - overhead. Curiously, for NVIDIA OpenCL with an empty-domain - test case, overall simulation performance was higher with - the API calls, but this has not been tested on AMD OpenCL, - so could be worth considering in future. */ - nb->bNonLocalStreamDoneMarked = false; - return; - } - - /* local/nonlocal offset and length used for xq and f */ - auto atomsRange = getGpuAtomRange(adat, atomLocality); - - /* beginning of timed D2H section */ - if (bDoTime) - { - timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream); - } - - /* With DD the local D2H transfer can only start after the non-local - has been launched. 
*/ - if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked) - { - nb->nonlocal_done.enqueueWaitEvent(deviceStream); - nb->bNonLocalStreamDoneMarked = false; - } - - /* DtoH f */ - GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float), - "The host force buffer should be in single precision to match device data size."); - copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + atomsRange.begin(), - &adat->f, - atomsRange.begin(), - atomsRange.size(), - deviceStream, - GpuApiCallBehavior::Async, - bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); - - /* kick off work */ - cl_error = clFlush(deviceStream.stream()); - GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); - - /* After the non-local D2H is launched the nonlocal_done event can be - recorded which signals that the local D2H can proceed. This event is not - placed after the non-local kernel because we first need the non-local - data back first. */ - if (iloc == InteractionLocality::NonLocal) - { - nb->nonlocal_done.markEvent(deviceStream); - nb->bNonLocalStreamDoneMarked = true; - } - - /* only transfer energies in the local stream */ - if (iloc == InteractionLocality::Local) - { - /* DtoH fshift when virial is needed */ - if (stepWork.computeVirial) - { - static_assert( - sizeof(*nb->nbst.fShift) == sizeof(Float3), - "Sizes of host- and device-side shift vector elements should be the same."); - copyFromDeviceBuffer(nb->nbst.fShift, - &adat->fShift, - 0, - SHIFTS, - deviceStream, - GpuApiCallBehavior::Async, - bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); - } - - /* DtoH energies */ - if (stepWork.computeEnergy) - { - static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float), - "Sizes of host- and device-side LJ energy terms should be the same."); - copyFromDeviceBuffer(nb->nbst.eLJ, - &adat->eLJ, - 0, - 1, - deviceStream, - GpuApiCallBehavior::Async, - bDoTime ? 
timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); - static_assert(sizeof(*nb->nbst.eElec) == sizeof(float), - "Sizes of host- and device-side electrostatic energy terms should be the " - "same."); - copyFromDeviceBuffer(nb->nbst.eElec, - &adat->eElec, - 0, - 1, - deviceStream, - GpuApiCallBehavior::Async, - bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); - } - } - - if (bDoTime) - { - timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream); - } -} - } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp index 1a130a6fb5..d508a20ef1 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp @@ -51,97 +51,6 @@ namespace Nbnxm { -/*! \brief - * Launch asynchronously the download of nonbonded forces from the GPU - * (and energies/shift forces if required). - */ -void gpu_launch_cpyback(NbnxmGpu* nb, - struct nbnxn_atomdata_t* nbatom, - const gmx::StepWorkload& stepWork, - const AtomLocality atomLocality) -{ - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - - const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - GMX_ASSERT(iloc == InteractionLocality::Local - || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false), - "Non-local stream is indicating that the copy back event is enqueued at the " - "beginning of the copy back function."); - - const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - NBAtomData* adat = nb->atdat; - - /* don't launch non-local copy-back if there was no non-local work to do */ - if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) - { - nb->bNonLocalStreamDoneMarked = false; - return; - } - - /* local/nonlocal offset and length used for xq and f */ - auto atomsRange = getGpuAtomRange(adat, atomLocality); - - // With DD the local D2H transfer can only start after the non-local kernel has finished. 
- if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked) - { - nb->nonlocal_done.waitForEvent(); - nb->bNonLocalStreamDoneMarked = false; - } - - /* DtoH f - * Skip if buffer ops / reduction is offloaded to the GPU. - */ - if (!stepWork.useGpuFBufferOps) - { - GMX_ASSERT(adat->f.elementSize() == sizeof(Float3), - "The size of the force buffer element should be equal to the size of float3."); - copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + atomsRange.begin(), - &adat->f, - atomsRange.begin(), - atomsRange.size(), - deviceStream, - GpuApiCallBehavior::Async, - nullptr); - } - - /* After the non-local D2H is launched the nonlocal_done event can be - recorded which signals that the local D2H can proceed. This event is not - placed after the non-local kernel because we want the non-local data - back first. */ - if (iloc == InteractionLocality::NonLocal) - { - nb->nonlocal_done.markEvent(deviceStream); - nb->bNonLocalStreamDoneMarked = true; - } - - /* only transfer energies in the local stream */ - if (iloc == InteractionLocality::Local) - { - /* DtoH fshift when virial is needed */ - if (stepWork.computeVirial) - { - GMX_ASSERT(sizeof(*nb->nbst.fShift) == adat->fShift.elementSize(), - "Sizes of host- and device-side shift vector elements should be the same."); - copyFromDeviceBuffer( - nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr); - } - - /* DtoH energies */ - if (stepWork.computeEnergy) - { - GMX_ASSERT(sizeof(*nb->nbst.eLJ) == sizeof(float), - "Sizes of host- and device-side LJ energy terms should be the same."); - copyFromDeviceBuffer( - nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); - GMX_ASSERT(sizeof(*nb->nbst.eElec) == sizeof(float), - "Sizes of host- and device-side electrostatic energy terms should be the " - "same."); - copyFromDeviceBuffer( - nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); - } - } -} 
- void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { gpu_plist* plist = nb->plist[iloc]; -- 2.22.0