Add FloatN aliases to CUDA and use them in NBNXM
authorArtem Zhmurov <zhmurov@gmail.com>
Sat, 20 Feb 2021 08:12:02 +0000 (11:12 +0300)
committerPaul Bauer <paul.bauer.q@gmail.com>
Mon, 22 Feb 2021 08:19:42 +0000 (08:19 +0000)
These aliases are necessary to unify the OpenCL, CUDA, and SYCL code.

Refs #3312, #2608, #3311

src/gromacs/gpu_utils/gputraits.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h

index fec113b4b4c1f4bcd1414e1ee8cfd4d4478b687f..656e8a231969a5bc85da806e01ab120b04504799 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -45,6 +45,7 @@
  * \ingroup module_gpu_utils
  */
 #include <cuda_runtime.h>
+#include "gromacs/math/vectypes.h"
 
 //! Device texture for fast read-only data fetching
 using DeviceTexture = cudaTextureObject_t;
@@ -52,6 +53,15 @@ using DeviceTexture = cudaTextureObject_t;
 //! \brief Single GPU call timing event - meaningless in CUDA
 using CommandEvent = void;
 
+//! Convenience alias for 2-wide float.
+using Float2 = float2;
+
+//! Convenience alias for 3-wide float.
+using Float3 = gmx::RVec;
+
+//! Convenience alias for 4-wide float.
+using Float4 = float4;
+
 /*! \internal \brief
  * GPU kernel scheduling description. This is the same in OpenCL/CUDA.
  * Provides reasonable defaults, one typically only needs to set the GPU stream
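
Why the aliases matter: the intent is that each backend's gputraits header (OpenCL, SYCL, and now CUDA) supplies its own Float2/Float3/Float4 definitions, so shared code can be written once against the alias. Note that Float3 maps to gmx::RVec here, which is why the vectypes.h include is added, while Float2 and Float4 remain CUDA's native vector types. A minimal sketch of the intent (hypothetical function name, not part of this commit):

    #include "gromacs/gpu_utils/gputraits.cuh" // CUDA build: Float3 = gmx::RVec, Float4 = float4

    // Backend-neutral declaration: the same prototype compiles under CUDA,
    // OpenCL, and SYCL because each backend defines the FloatN aliases.
    void fillXqBuffer(Float4* xq, int numAtoms);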
index 6796da5aac6660b63ea165d0397aadcc779d1ca4..62d50f039d40271de79585e8ac13379af5508ba9 100644 (file)
@@ -496,17 +496,17 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
         adat_len   = adat->natoms - adat->natoms_local;
     }
 
-    /* HtoD x, q */
     /* beginning of timed HtoD section */
     if (bDoTime)
     {
         t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
     }
 
-    static_assert(sizeof(adat->xq[0]) == sizeof(float4),
+    /* HtoD x, q */
+    static_assert(sizeof(adat->xq[0]) == sizeof(Float4),
                   "The size of the xyzq buffer element should be equal to the size of float4.");
     copyToDeviceBuffer(&adat->xq,
-                       reinterpret_cast<const float4*>(nbatom->x().data()) + adat_begin,
+                       reinterpret_cast<const Float4*>(nbatom->x().data()) + adat_begin,
                        adat_begin,
                        adat_len,
                        deviceStream,
@@ -845,9 +845,9 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     if (!stepWork.useGpuFBufferOps)
     {
         static_assert(
-                sizeof(adat->f[0]) == sizeof(float3),
+                sizeof(adat->f[0]) == sizeof(Float3),
                 "The size of the force buffer element should be equal to the size of float3.");
-        copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adat_begin,
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + adat_begin,
                              &adat->f,
                              adat_begin,
                              adat_len,
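
The pattern above, a static_assert on element size followed by a reinterpret_cast, is what keeps the alias swap safe: the cast from the host's flat float storage is only legal if the device element has the expected packed layout. The idiom in isolation, with a hypothetical host buffer standing in for nbatom->x():

    // Guard-then-cast idiom (sketch; hostXq is an illustrative name):
    static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be four packed floats");
    const Float4* xqPacked = reinterpret_cast<const Float4*>(hostXq.data()); // xyzq per atom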
index 7d1334144fc4342db4db557226593b4ef4ae56bf..dff7e4d21cf8e68f98c1a6e7852b7b9a67fe4ef0 100644 (file)
@@ -268,7 +268,7 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
         static_assert(sizeof(adat->shift_vec[0]) == sizeof(nbatom->shift_vec[0]),
                       "Sizes of host- and device-side shift vectors should be the same.");
         copyToDeviceBuffer(&adat->shift_vec,
-                           reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
+                           reinterpret_cast<const Float3*>(nbatom->shift_vec.data()),
                            0,
                            SHIFTS,
                            localStream,
@@ -368,10 +368,10 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
     if (useLjCombRule(nb->nbparam->vdwType))
     {
-        static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(float2),
+        static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(Float2),
                       "Size of the LJ parameters element should be equal to the size of float2.");
         copyToDeviceBuffer(&d_atdat->lj_comb,
-                           reinterpret_cast<const float2*>(nbat->params().lj_comb.data()),
+                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                            0,
                            natoms,
                            localStream,
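
The same guard protects the combination-rule parameters: the host stores two floats per atom, so natoms elements of Float2 alias the same memory. In isolation (hypothetical name for the host array):

    // Host layout assumed by the cast above: [sqrt(c6_0), sqrt(c12_0), sqrt(c6_1), ...]
    static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be two packed floats");
    const Float2* ljPairs = reinterpret_cast<const Float2*>(hostLjComb.data());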
index 0ff57b25b120cff746e1d06d00a2a1177eb729b4..688e094715ea6912efb5bb918637ffa153a052dd 100644 (file)
@@ -48,6 +48,7 @@
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/math/utilities.h"
 #include "gromacs/pbcutil/ishift.h"
 /* Note that floating-point constants in CUDA code should be suffixed
@@ -178,8 +179,8 @@ __launch_bounds__(THREADS_PER_BLOCK)
     float2        ljcp_i, ljcp_j;
 #    endif
     const float4*        xq          = atdat.xq;
-    float3*              f           = atdat.f;
-    const float3*        shift_vec   = atdat.shift_vec;
+    float3*              f           = asFloat3(atdat.f);
+    const float3*        shift_vec   = asFloat3(atdat.shift_vec);
     float                rcoulomb_sq = nbparam.rcoulomb_sq;
 #    ifdef VDW_CUTOFF_CHECK
     float                rvdw_sq     = nbparam.rvdw_sq;
@@ -648,7 +649,8 @@ __launch_bounds__(THREADS_PER_BLOCK)
     /* add up local shift forces into global mem, tidxj indexes x,y,z */
     if (bCalcFshift && (tidxj & 3) < 3)
     {
-        atomicAdd(&(atdat.fshift[nb_sci.shift].x) + (tidxj & 3), fshift_buf);
+        float3* fshift = asFloat3(atdat.fshift);
+        atomicAdd(&(fshift[nb_sci.shift].x) + (tidxj & 3), fshift_buf);
     }
 
 #    ifdef CALC_ENERGIES
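
Inside the kernels the buffers still need CUDA's native float3 (the .x member indexing in the atomicAdd above relies on it), so the new typecasts.cuh include provides asFloat3() to cast back at the kernel boundary. Its implementation is not shown in this diff; a plausible sketch, assuming gmx::RVec and float3 share layout:

    // Hedged sketch of an asFloat3 helper; the real one lives in
    // gromacs/gpu_utils/typecasts.cuh and may differ in detail.
    static inline __host__ __device__ float3* asFloat3(gmx::RVec* in)
    {
        static_assert(sizeof(gmx::RVec) == sizeof(float3),
                      "gmx::RVec and float3 must have identical layout");
        return reinterpret_cast<float3*>(in);
    }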
index 563e1edc0c74b4faa13492f3d2af70b19c5a5d4d..11c51227f4e8b04598ea9ac7257b2a856c12a23d 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -47,6 +47,7 @@
 #include "gmxpre.h"
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/math/utilities.h"
 #include "gromacs/pbcutil/ishift.h"
 
@@ -124,7 +125,7 @@ nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const NBParamGpu, const Nbnx
     const nbnxn_sci_t* pl_sci    = plist.sci;
     nbnxn_cj4_t*       pl_cj4    = plist.cj4;
     const float4*      xq        = atdat.xq;
-    const float3*      shift_vec = atdat.shift_vec;
+    const float3*      shift_vec = asFloat3(atdat.shift_vec);
 
     float rlistOuter_sq = nbparam.rlistOuter_sq;
     float rlistInner_sq = nbparam.rlistInner_sq;
index 65a247ad08408a498790eca6d971d8f1fd361df5..f1b1a6db8178a2a20aeb71ae4b251d815edf08a8 100644 (file)
@@ -84,7 +84,7 @@ struct nb_staging_t
     //! electrostatic energy
     float* e_el = nullptr;
     //! shift forces
-    float3* fshift = nullptr;
+    Float3* fshift = nullptr;
 };
 
 /** \internal
@@ -100,9 +100,9 @@ struct cu_atomdata
     int nalloc;
 
     //! atom coordinates + charges, size natoms
-    DeviceBuffer<float4> xq;
+    DeviceBuffer<Float4> xq;
     //! force output array, size natoms
-    DeviceBuffer<float3> f;
+    DeviceBuffer<Float3> f;
 
     //! LJ energy output, size 1
     DeviceBuffer<float> e_lj;
@@ -110,17 +110,17 @@ struct cu_atomdata
     DeviceBuffer<float> e_el;
 
     //! shift forces
-    DeviceBuffer<float3> fshift;
+    DeviceBuffer<Float3> fshift;
 
     //! number of atom types
     int ntypes;
     //! atom type indices, size natoms
     DeviceBuffer<int> atom_types;
     //! sqrt(c6),sqrt(c12) size natoms
-    DeviceBuffer<float2> lj_comb;
+    DeviceBuffer<Float2> lj_comb;
 
     //! shifts
-    DeviceBuffer<float3> shift_vec;
+    DeviceBuffer<Float3> shift_vec;
     //! true if the shift vector has been uploaded
     bool bShiftVecUploaded;
 };
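
With cu_atomdata expressed in the FloatN aliases, the allocation and transfer layer needs no per-backend specialization; only the kernels cast back at the boundary. A hedged usage sketch, assuming the existing allocateDeviceBuffer template:

    // DeviceBuffer<Float3>/<Float4> go through the same templated allocator
    // as any other element type (sizes illustrative):
    allocateDeviceBuffer(&d_atdat->xq, numAtoms, deviceContext);   // Float4: x, y, z, q
    allocateDeviceBuffer(&d_atdat->f, numAtoms, deviceContext);    // Float3: fx, fy, fz
    allocateDeviceBuffer(&d_atdat->fshift, SHIFTS, deviceContext); // one Float3 per shift vector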