From: Artem Zhmurov <zhmurov@gmail.com>
Date: Mon, 22 Feb 2021 09:43:43 +0000 (+0000)
Subject: Add FloatN aliases to OpenCL and use them in NBNXM
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=c66827166fc9099ecd1a4a2f7080558df70bf529;p=alexxy%2Fgromacs.git

Add FloatN aliases to OpenCL and use them in NBNXM

These aliases are necessary to unify OpenCL, CUDA and SYCL
code.

Refs #3312, #2608, #3311
---

diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h
index 489bb0527c..a8a3c26818 100644
--- a/src/gromacs/gpu_utils/gputraits_ocl.h
+++ b/src/gromacs/gpu_utils/gputraits_ocl.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -46,12 +46,22 @@
  */
 
 #include "gromacs/gpu_utils/gmxopencl.h"
+#include "gromacs/math/vectypes.h"
 
 using DeviceTexture = void*;
 
 //! \brief Single GPU call timing event
 using CommandEvent = cl_event;
 
+//! Convenience alias for 2-wide float
+using Float2 = cl_float2;
+
+//! Convenience alias for 3-wide float. Not using cl_float3 due to alignment issues.
+using Float3 = gmx::RVec;
+
+//! Convenience alias for 4-wide float.
+using Float4 = cl_float4;
+
 /*! \internal \brief
  * GPU kernels scheduling description. This is same in OpenCL/CUDA.
  * Provides reasonable defaults, one typically only needs to set the GPU stream
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index 7e4eeca484..6cdad01019 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -577,12 +577,12 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     }
 
     /* HtoD x, q */
-    GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()),
-               "The size of the xyzq buffer element should be equal to the size of float4.");
+    static_assert(sizeof(float) == sizeof(*nbatom->x().data()),
+                  "The size of the xyzq buffer element should be equal to the size of float4.");
     copyToDeviceBuffer(&adat->xq,
-                       nbatom->x().data() + adat_begin * 4,
-                       adat_begin * 4,
-                       adat_len * 4,
+                       reinterpret_cast<const Float4*>(nbatom->x().data()) + adat_begin,
+                       adat_begin,
+                       adat_len,
                        deviceStream,
                        GpuApiCallBehavior::Async,
                        bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
@@ -984,10 +984,10 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     /* DtoH f */
     GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
                "The host force buffer should be in single precision to match device data size.");
-    copyFromDeviceBuffer(&nbatom->out[0].f[adat_begin * DIM],
+    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + adat_begin,
                          &adat->f,
-                         adat_begin * DIM,
-                         adat_len * DIM,
+                         adat_begin,
+                         adat_len,
                          deviceStream,
                          GpuApiCallBehavior::Async,
                          bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
@@ -1012,12 +1012,13 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         /* DtoH fshift when virial is needed */
         if (stepWork.computeVirial)
         {
-            GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float),
-                       "Sizes of host- and device-side shift vector elements should be the same.");
-            copyFromDeviceBuffer(reinterpret_cast<float*>(nb->nbst.fshift),
+            static_assert(
+                    sizeof(*nb->nbst.fshift) == sizeof(Float3),
+                    "Sizes of host- and device-side shift vector elements should be the same.");
+            copyFromDeviceBuffer(nb->nbst.fshift,
                                  &adat->fshift,
                                  0,
-                                 SHIFTS * DIM,
+                                 SHIFTS,
                                  deviceStream,
                                  GpuApiCallBehavior::Async,
                                  bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
@@ -1026,8 +1027,8 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
         /* DtoH energies */
         if (stepWork.computeEnergy)
         {
-            GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
-                       "Sizes of host- and device-side LJ energy terms should be the same.");
+            static_assert(sizeof(*nb->nbst.e_lj) == sizeof(float),
+                          "Sizes of host- and device-side LJ energy terms should be the same.");
             copyFromDeviceBuffer(nb->nbst.e_lj,
                                  &adat->e_lj,
                                  0,
@@ -1035,9 +1036,9 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                                  deviceStream,
                                  GpuApiCallBehavior::Async,
                                  bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-            GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
-                       "Sizes of host- and device-side electrostatic energy terms should be the "
-                       "same.");
+            static_assert(sizeof(*nb->nbst.e_el) == sizeof(float),
+                          "Sizes of host- and device-side electrostatic energy terms should be the "
+                          "same.");
             copyFromDeviceBuffer(nb->nbst.e_el,
                                  &adat->e_el,
                                  0,
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index 7472b37c70..706c3a48d5 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -375,7 +375,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear)
     cl_atomdata_t*      atomData    = nb->atdat;
     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
 
-    clearDeviceBufferAsync(&atomData->f, 0, natoms_clear * DIM, localStream);
+    clearDeviceBufferAsync(&atomData->f, 0, natoms_clear, localStream);
 }
 
 //! This function is documented in the header file
@@ -404,12 +404,12 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
     {
-        GMX_ASSERT(sizeof(float) * DIM == sizeof(*nbatom->shift_vec.data()),
-                   "Sizes of host- and device-side shift vectors should be the same.");
+        static_assert(sizeof(Float3) == sizeof(nbatom->shift_vec[0]),
+                      "Sizes of host- and device-side shift vectors should be the same.");
         copyToDeviceBuffer(&adat->shift_vec,
-                           reinterpret_cast<const float*>(nbatom->shift_vec.data()),
+                           reinterpret_cast<const Float3*>(nbatom->shift_vec.data()),
                            0,
-                           SHIFTS * DIM,
+                           SHIFTS,
                            localStream,
                            GpuApiCallBehavior::Async,
                            nullptr);
@@ -454,13 +454,13 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
         }
 
 
-        allocateDeviceBuffer(&d_atdat->f, nalloc * DIM, deviceContext);
-        allocateDeviceBuffer(&d_atdat->xq, nalloc * (DIM + 1), deviceContext);
+        allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext);
+        allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext);
 
         if (useLjCombRule(nb->nbparam->vdwType))
         {
             // Two Lennard-Jones parameters per atom
-            allocateDeviceBuffer(&d_atdat->lj_comb, nalloc * 2, deviceContext);
+            allocateDeviceBuffer(&d_atdat->lj_comb, nalloc, deviceContext);
         }
         else
         {
@@ -482,20 +482,20 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
     if (useLjCombRule(nb->nbparam->vdwType))
     {
-        GMX_ASSERT(sizeof(float) == sizeof(*nbat->params().lj_comb.data()),
-                   "Size of the LJ parameters element should be equal to the size of float2.");
+        static_assert(sizeof(float) == sizeof(*nbat->params().lj_comb.data()),
+                      "Size of the LJ parameters element should be equal to the size of float2.");
         copyToDeviceBuffer(&d_atdat->lj_comb,
-                           nbat->params().lj_comb.data(),
+                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                            0,
-                           2 * natoms,
+                           natoms,
                            localStream,
                            GpuApiCallBehavior::Async,
                            bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
     }
     else
     {
-        GMX_ASSERT(sizeof(int) == sizeof(*nbat->params().type.data()),
-                   "Sizes of host- and device-side atom types should be the same.");
+        static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
+                      "Sizes of host- and device-side atom types should be the same.");
         copyToDeviceBuffer(&d_atdat->atom_types,
                            nbat->params().type.data(),
                            0,
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
index 013345b32c..751e352962 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -87,7 +87,7 @@ struct nb_staging_t
     //! electrostatic energy
     float* e_el = nullptr;
     //! float3 buffer with shift forces
-    float (*fshift)[3] = nullptr;
+    Float3* fshift = nullptr;
 };
 
 /*! \internal
@@ -103,10 +103,10 @@ typedef struct cl_atomdata
     int nalloc;
 
     //! float4 buffer with atom coordinates + charges, size natoms
-    DeviceBuffer<float> xq;
+    DeviceBuffer<Float4> xq;
 
     //! float3 buffer with force output array, size natoms
-    DeviceBuffer<float> f;
+    DeviceBuffer<Float3> f;
 
     //! LJ energy output, size 1
     DeviceBuffer<float> e_lj;
@@ -114,17 +114,17 @@ typedef struct cl_atomdata
     DeviceBuffer<float> e_el;
 
     //! float3 buffer with shift forces
-    DeviceBuffer<float> fshift;
+    DeviceBuffer<Float3> fshift;
 
     //! number of atom types
     int ntypes;
     //! int buffer with atom type indices, size natoms
     DeviceBuffer<int> atom_types;
     //! float2 buffer with sqrt(c6),sqrt(c12), size natoms
-    DeviceBuffer<float> lj_comb;
+    DeviceBuffer<Float2> lj_comb;
 
     //! float3 buffer with shifts values
-    DeviceBuffer<float> shift_vec;
+    DeviceBuffer<Float3> shift_vec;
 
     //! true if the shift vector has been uploaded
     bool bShiftVecUploaded;