From 8bda443515edfc586752153ab847205c9b8dabd6 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Mon, 22 Feb 2021 21:27:08 +0300
Subject: [PATCH] Use getAtomRanges(...) function in NBNXM more

Make getAtomRanges(...) function to return gmx::Range and
use it more.

Refs #2608
---
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu   | 31 ++++++-----------
 src/gromacs/nbnxm/gpu_common.h         | 16 +++------
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 48 ++++++++++----------------
 src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp  | 20 +++++------
 4 files changed, 43 insertions(+), 72 deletions(-)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 594c4ca291..ccb1fdf26d 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -454,7 +454,6 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
 
     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 
-    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 
     NBAtomData*         adat         = nb->atdat;
     gpu_plist*          plist        = nb->plist[iloc];
@@ -484,17 +483,8 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
         return;
     }
 
-    /* calculate the atom data index range based on locality */
-    if (atomLocality == AtomLocality::Local)
-    {
-        adat_begin = 0;
-        adat_len   = adat->numAtomsLocal;
-    }
-    else
-    {
-        adat_begin = adat->numAtomsLocal;
-        adat_len   = adat->numAtoms - adat->numAtomsLocal;
-    }
+    /* local/nonlocal offset and length used for xq and f */
+    auto atomsRange = getGpuAtomRange(adat, atomLocality);
 
     /* beginning of timed HtoD section */
     if (bDoTime)
@@ -506,9 +496,9 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     static_assert(sizeof(adat->xq[0]) == sizeof(Float4),
                   "The size of the xyzq buffer element should be equal to the size of float4.");
     copyToDeviceBuffer(&adat->xq,
-                       reinterpret_cast<const Float4*>(nbatom->x().data()) + adat_begin,
-                       adat_begin,
-                       adat_len,
+                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
+                       atomsRange.begin(),
+                       atomsRange.size(),
                        deviceStream,
                        GpuApiCallBehavior::Async,
                        nullptr);
@@ -801,8 +791,6 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
-    int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
-
     /* determine interaction locality from atom locality */
     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
     GMX_ASSERT(iloc == InteractionLocality::Local
@@ -823,7 +811,8 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         return;
     }
 
-    getGpuAtomRange(adat, atomLocality, &adat_begin, &adat_len);
+    /* local/nonlocal offset and length used for xq and f */
+    auto atomsRange = getGpuAtomRange(adat, atomLocality);
 
     /* beginning of timed D2H section */
     if (bDoTime)
@@ -847,10 +836,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         static_assert(
                 sizeof(adat->f[0]) == sizeof(Float3),
                 "The size of the force buffer element should be equal to the size of float3.");
-        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + adat_begin,
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                              &adat->f,
-                             adat_begin,
-                             adat_len,
+                             atomsRange.begin(),
+                             atomsRange.size(),
                              deviceStream,
                              GpuApiCallBehavior::Async,
                              nullptr);
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index dccfe1eed0..92002bbf4b 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -68,6 +68,7 @@
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/range.h"
 #include "gromacs/utility/stringutil.h"
 
 #include "gpu_common_utils.h"
@@ -167,14 +168,9 @@ bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::AtomLocality aLocality
  *
  * \param[in] atomData Atom descriptor data structure
  * \param[in] atomLocality Atom locality specifier
- * \param[out] atomRangeBegin Starting index of the atom range in the atom data array.
- * \param[out] atomRangeLen Atom range length in the atom data array.
+ * \returns Range of indexes for selected locality.
  */
-template<typename AtomDataT>
-static inline void getGpuAtomRange(const AtomDataT*   atomData,
-                                   const AtomLocality atomLocality,
-                                   int*               atomRangeBegin,
-                                   int*               atomRangeLen)
+static inline gmx::Range<int> getGpuAtomRange(const NBAtomData* atomData, const AtomLocality atomLocality)
 {
     assert(atomData);
     validateGpuAtomLocality(atomLocality);
@@ -182,13 +178,11 @@ static inline void getGpuAtomRange(const AtomDataT*   atomData,
     /* calculate the atom data index range based on locality */
     if (atomLocality == AtomLocality::Local)
     {
-        *atomRangeBegin = 0;
-        *atomRangeLen   = atomData->numAtomsLocal;
+        return gmx::Range<int>(0, atomData->numAtomsLocal);
     }
     else
     {
-        *atomRangeBegin = atomData->numAtomsLocal;
-        *atomRangeLen   = atomData->numAtoms - atomData->numAtomsLocal;
+        return gmx::Range<int>(atomData->numAtomsLocal, atomData->numAtoms);
     }
 }
 
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index e00874a30b..50e7b9d8d4 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -527,9 +527,6 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
 
     const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
 
-    /* local/nonlocal offset and length used for xq and f */
-    int adat_begin, adat_len;
-
     NBAtomData*         adat         = nb->atdat;
     gpu_plist*          plist        = nb->plist[iloc];
     cl_timers_t*        t            = nb->timers;
@@ -558,17 +555,8 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
         return;
     }
 
-    /* calculate the atom data index range based on locality */
-    if (atomLocality == AtomLocality::Local)
-    {
-        adat_begin = 0;
-        adat_len   = adat->numAtomsLocal;
-    }
-    else
-    {
-        adat_begin = adat->numAtomsLocal;
-        adat_len   = adat->numAtoms - adat->numAtomsLocal;
-    }
+    /* local/nonlocal offset and length used for xq and f */
+    auto atomsRange = getGpuAtomRange(adat, atomLocality);
 
     /* beginning of timed HtoD section */
     if (bDoTime)
@@ -580,9 +568,9 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     static_assert(sizeof(float) == sizeof(*nbatom->x().data()),
                   "The size of the xyzq buffer element should be equal to the size of float4.");
     copyToDeviceBuffer(&adat->xq,
-                       reinterpret_cast<const Float4*>(nbatom->x().data()) + adat_begin,
-                       adat_begin,
-                       adat_len,
+                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
+                       atomsRange.begin(),
+                       atomsRange.size(),
                        deviceStream,
                        GpuApiCallBehavior::Async,
                        bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
@@ -931,15 +919,14 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
 void gpu_launch_cpyback(NbnxmGpu*                nb,
                         struct nbnxn_atomdata_t* nbatom,
                         const gmx::StepWorkload& stepWork,
-                        const AtomLocality       aloc)
+                        const AtomLocality       atomLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
     cl_int gmx_unused cl_error;
-    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
 
     /* determine interaction locality from atom locality */
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
+    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
     GMX_ASSERT(iloc == InteractionLocality::Local
                        || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
                "Non-local stream is indicating that the copy back event is enqueued at the "
@@ -965,12 +952,13 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         return;
     }
 
-    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
+    /* local/nonlocal offset and length used for xq and f */
+    auto atomsRange = getGpuAtomRange(adat, atomLocality);
 
     /* beginning of timed D2H section */
     if (bDoTime)
     {
-        t->xf[aloc].nb_d2h.openTimingRegion(deviceStream);
+        t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
     }
 
     /* With DD the local D2H transfer can only start after the non-local
@@ -984,13 +972,13 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     /* DtoH f */
     GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                "The host force buffer should be in single precision to match device data size.");
-    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + adat_begin,
+    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                          &adat->f,
-                         adat_begin,
-                         adat_len,
+                         atomsRange.begin(),
+                         atomsRange.size(),
                          deviceStream,
                          GpuApiCallBehavior::Async,
-                         bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+                         bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
 
     /* kick off work */
     cl_error = clFlush(deviceStream.stream());
@@ -1021,7 +1009,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                                  SHIFTS,
                                  deviceStream,
                                  GpuApiCallBehavior::Async,
-                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+                                 bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
         }
 
         /* DtoH energies */
@@ -1035,7 +1023,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                                  1,
                                  deviceStream,
                                  GpuApiCallBehavior::Async,
-                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+                                 bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
             static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
                           "Sizes of host- and device-side electrostatic energy terms should be the "
                           "same.");
@@ -1045,13 +1033,13 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                                  1,
                                  deviceStream,
                                  GpuApiCallBehavior::Async,
-                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
+                                 bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
         }
     }
 
     if (bDoTime)
     {
-        t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream);
+        t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
     }
 }
 
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
index 60ba8b32f4..7428869888 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
@@ -96,8 +96,8 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         return;
     }
 
-    int adatBegin, adatLen;
-    getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
+    /* local/nonlocal offset and length used for xq and f */
+    auto atomsRange = getGpuAtomRange(adat, atomLocality);
 
     // With DD the local D2H transfer can only start after the non-local kernel has finished.
     if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
@@ -113,10 +113,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     {
         GMX_ASSERT(adat->f.elementSize() == sizeof(Float3),
                    "The size of the force buffer element should be equal to the size of float3.");
-        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + adatBegin,
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
                              &adat->f,
-                             adatBegin,
-                             adatLen,
+                             atomsRange.begin(),
+                             atomsRange.size(),
                              deviceStream,
                              GpuApiCallBehavior::Async,
                              nullptr);
@@ -193,16 +193,16 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
         return;
     }
 
-    int adatBegin, adatLen;
-    getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
+    /* local/nonlocal offset and length used for xq and f */
+    auto atomsRange = getGpuAtomRange(adat, atomLocality);
 
     /* HtoD x, q */
     GMX_ASSERT(adat->xq.elementSize() == sizeof(Float4),
                "The size of the xyzq buffer element should be equal to the size of float4.");
     copyToDeviceBuffer(&adat->xq,
-                       reinterpret_cast<const Float4*>(nbatom->x().data()) + adatBegin,
-                       adatBegin,
-                       adatLen,
+                       reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
+                       atomsRange.begin(),
+                       atomsRange.size(),
                        deviceStream,
                        GpuApiCallBehavior::Async,
                        nullptr);
-- 
2.22.0