From 8bda443515edfc586752153ab847205c9b8dabd6 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Mon, 22 Feb 2021 21:27:08 +0300 Subject: [PATCH] Use getGpuAtomRange(...) function in NBNXM more Make the getGpuAtomRange(...) function return gmx::Range and use it in more places. Refs #2608 --- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 31 ++++++----------- src/gromacs/nbnxm/gpu_common.h | 16 +++------ src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 48 ++++++++++---------------- src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp | 20 +++++------ 4 files changed, 43 insertions(+), 72 deletions(-) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 594c4ca291..ccb1fdf26d 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -454,7 +454,6 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; @@ -484,17 +483,8 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom return; } - /* calculate the atom data index range based on locality */ - if (atomLocality == AtomLocality::Local) - { - adat_begin = 0; - adat_len = adat->numAtomsLocal; - } - else - { - adat_begin = adat->numAtomsLocal; - adat_len = adat->numAtoms - adat->numAtomsLocal; - } + /* local/nonlocal offset and length used for xq and f */ + auto atomsRange = getGpuAtomRange(adat, atomLocality); /* beginning of timed HtoD section */ if (bDoTime) @@ -506,9 +496,9 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom static_assert(sizeof(adat->xq[0]) == sizeof(Float4), "The size of the xyzq buffer element should be equal to the size of float4."); copyToDeviceBuffer(&adat->xq, - reinterpret_cast(nbatom->x().data()) + adat_begin, - adat_begin, - adat_len, + 
reinterpret_cast(nbatom->x().data()) + atomsRange.begin(), + atomsRange.begin(), + atomsRange.size(), deviceStream, GpuApiCallBehavior::Async, nullptr); @@ -801,8 +791,6 @@ void gpu_launch_cpyback(NbnxmGpu* nb, { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ - /* determine interaction locality from atom locality */ const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); GMX_ASSERT(iloc == InteractionLocality::Local @@ -823,7 +811,8 @@ void gpu_launch_cpyback(NbnxmGpu* nb, return; } - getGpuAtomRange(adat, atomLocality, &adat_begin, &adat_len); + /* local/nonlocal offset and length used for xq and f */ + auto atomsRange = getGpuAtomRange(adat, atomLocality); /* beginning of timed D2H section */ if (bDoTime) @@ -847,10 +836,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, static_assert( sizeof(adat->f[0]) == sizeof(Float3), "The size of the force buffer element should be equal to the size of float3."); - copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + adat_begin, + copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + atomsRange.begin(), &adat->f, - adat_begin, - adat_len, + atomsRange.begin(), + atomsRange.size(), deviceStream, GpuApiCallBehavior::Async, nullptr); diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index dccfe1eed0..92002bbf4b 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -68,6 +68,7 @@ #include "gromacs/timing/gpu_timing.h" #include "gromacs/timing/wallcycle.h" #include "gromacs/utility/fatalerror.h" +#include "gromacs/utility/range.h" #include "gromacs/utility/stringutil.h" #include "gpu_common_utils.h" @@ -167,14 +168,9 @@ bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality * * \param[in] atomData Atom descriptor data structure * \param[in] atomLocality Atom locality specifier - * \param[out] atomRangeBegin Starting index of 
the atom range in the atom data array. - * \param[out] atomRangeLen Atom range length in the atom data array. + * \returns Range of indexes for selected locality. */ -template -static inline void getGpuAtomRange(const AtomDataT* atomData, - const AtomLocality atomLocality, - int* atomRangeBegin, - int* atomRangeLen) +static inline gmx::Range getGpuAtomRange(const NBAtomData* atomData, const AtomLocality atomLocality) { assert(atomData); validateGpuAtomLocality(atomLocality); @@ -182,13 +178,11 @@ static inline void getGpuAtomRange(const AtomDataT* atomData, /* calculate the atom data index range based on locality */ if (atomLocality == AtomLocality::Local) { - *atomRangeBegin = 0; - *atomRangeLen = atomData->numAtomsLocal; + return gmx::Range(0, atomData->numAtomsLocal); } else { - *atomRangeBegin = atomData->numAtomsLocal; - *atomRangeLen = atomData->numAtoms - atomData->numAtomsLocal; + return gmx::Range(atomData->numAtomsLocal, atomData->numAtoms); } } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index e00874a30b..50e7b9d8d4 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -527,9 +527,6 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - /* local/nonlocal offset and length used for xq and f */ - int adat_begin, adat_len; - NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; @@ -558,17 +555,8 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom return; } - /* calculate the atom data index range based on locality */ - if (atomLocality == AtomLocality::Local) - { - adat_begin = 0; - adat_len = adat->numAtomsLocal; - } - else - { - adat_begin = adat->numAtomsLocal; - adat_len = adat->numAtoms - adat->numAtomsLocal; - } + /* local/nonlocal offset and length used for xq and f */ + auto 
atomsRange = getGpuAtomRange(adat, atomLocality); /* beginning of timed HtoD section */ if (bDoTime) @@ -580,9 +568,9 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom static_assert(sizeof(float) == sizeof(*nbatom->x().data()), "The size of the xyzq buffer element should be equal to the size of float4."); copyToDeviceBuffer(&adat->xq, - reinterpret_cast(nbatom->x().data()) + adat_begin, - adat_begin, - adat_len, + reinterpret_cast(nbatom->x().data()) + atomsRange.begin(), + atomsRange.begin(), + atomsRange.size(), deviceStream, GpuApiCallBehavior::Async, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); @@ -931,15 +919,14 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c void gpu_launch_cpyback(NbnxmGpu* nb, struct nbnxn_atomdata_t* nbatom, const gmx::StepWorkload& stepWork, - const AtomLocality aloc) + const AtomLocality atomLocality) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); cl_int gmx_unused cl_error; - int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* determine interaction locality from atom locality */ - const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc); + const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); GMX_ASSERT(iloc == InteractionLocality::Local || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false), "Non-local stream is indicating that the copy back event is enqueued at the " @@ -965,12 +952,13 @@ void gpu_launch_cpyback(NbnxmGpu* nb, return; } - getGpuAtomRange(adat, aloc, &adat_begin, &adat_len); + /* local/nonlocal offset and length used for xq and f */ + auto atomsRange = getGpuAtomRange(adat, atomLocality); /* beginning of timed D2H section */ if (bDoTime) { - t->xf[aloc].nb_d2h.openTimingRegion(deviceStream); + t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream); } /* With DD the local D2H transfer can only start after the non-local @@ 
-984,13 +972,13 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH f */ GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float), "The host force buffer should be in single precision to match device data size."); - copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + adat_begin, + copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + atomsRange.begin(), &adat->f, - adat_begin, - adat_len, + atomsRange.begin(), + atomsRange.size(), deviceStream, GpuApiCallBehavior::Async, - bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); /* kick off work */ cl_error = clFlush(deviceStream.stream()); @@ -1021,7 +1009,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, SHIFTS, deviceStream, GpuApiCallBehavior::Async, - bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); } /* DtoH energies */ @@ -1035,7 +1023,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, 1, deviceStream, GpuApiCallBehavior::Async, - bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); static_assert(sizeof(*nb->nbst.eElec) == sizeof(float), "Sizes of host- and device-side electrostatic energy terms should be the " "same."); @@ -1045,13 +1033,13 @@ void gpu_launch_cpyback(NbnxmGpu* nb, 1, deviceStream, GpuApiCallBehavior::Async, - bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); + bDoTime ? 
t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr); } } if (bDoTime) { - t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream); + t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream); } } diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp index 60ba8b32f4..7428869888 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp @@ -96,8 +96,8 @@ void gpu_launch_cpyback(NbnxmGpu* nb, return; } - int adatBegin, adatLen; - getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen); + /* local/nonlocal offset and length used for xq and f */ + auto atomsRange = getGpuAtomRange(adat, atomLocality); // With DD the local D2H transfer can only start after the non-local kernel has finished. if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked) @@ -113,10 +113,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, { GMX_ASSERT(adat->f.elementSize() == sizeof(Float3), "The size of the force buffer element should be equal to the size of float3."); - copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + adatBegin, + copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + atomsRange.begin(), &adat->f, - adatBegin, - adatLen, + atomsRange.begin(), + atomsRange.size(), deviceStream, GpuApiCallBehavior::Async, nullptr); @@ -193,16 +193,16 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom return; } - int adatBegin, adatLen; - getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen); + /* local/nonlocal offset and length used for xq and f */ + auto atomsRange = getGpuAtomRange(adat, atomLocality); /* HtoD x, q */ GMX_ASSERT(adat->xq.elementSize() == sizeof(Float4), "The size of the xyzq buffer element should be equal to the size of float4."); copyToDeviceBuffer(&adat->xq, - reinterpret_cast(nbatom->x().data()) + adatBegin, - adatBegin, - adatLen, + reinterpret_cast(nbatom->x().data()) + atomsRange.begin(), + atomsRange.begin(), + 
atomsRange.size(), deviceStream, GpuApiCallBehavior::Async, nullptr); -- 2.22.0