return lerp(left, right, fraction); // TODO: cl::sycl::mix
}
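(A note on the TODO above: mix(x, y, a) is the standard SYCL/OpenCL blend x + (y - x) * a, i.e. the same operation lerp implements, so the intended change is presumably just return cl::sycl::mix(left, right, fraction); once that is usable on all targets.)

The remainder of this patch replaces the SYCL vector aliases float2/float3/float4 with GROMACS's own Float2/Float3/Float4. Purely as an illustrative sketch, not the real GROMACS definitions (those live in the GPU traits headers), the kernel code below only needs the types to offer per-component access, construction from components, and element-wise arithmetic, roughly along these lines:

// Hypothetical minimal stand-in showing the interface the kernel relies on;
// not the actual GROMACS type.
struct Float3
{
    float c[3];
    Float3() = default;
    Float3(float x, float y, float z) : c{ x, y, z } {}
    float&       operator[](int i) { return c[i]; }
    const float& operator[](int i) const { return c[i]; }
    Float3  operator-(const Float3& o) const { return { c[0] - o.c[0], c[1] - o.c[1], c[2] - o.c[2] }; }
    Float3  operator*(float s) const { return { c[0] * s, c[1] * s, c[2] * s }; }
    Float3& operator-=(const Float3& o)
    {
        c[0] -= o.c[0];
        c[1] -= o.c[1];
        c[2] -= o.c[2];
        return *this;
    }
};
// Squared length, used in the cutoff checks below; Float2 and Float4 are the
// analogous 2- and 4-component aggregates.
static inline float norm2(const Float3& v)
{
    return v[0] * v[0] + v[1] * v[1] + v[2] * v[2];
}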
-static inline void reduceForceJShuffle(float3 f,
+static inline void reduceForceJShuffle(Float3 f,
const cl::sycl::nd_item<1> itemIdx,
const int tidxi,
const int aidx,
* This implementation works only with power-of-two array sizes.
*/
static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
- const float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
+ const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
const bool calcFShift,
const cl::sycl::nd_item<1> itemIdx,
const int tidxi,
*/
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
auto nbnxmKernel(cl::sycl::handler& cgh,
- DeviceAccessor<float4, mode::read> a_xq,
+ DeviceAccessor<Float4, mode::read> a_xq,
DeviceAccessor<float, mode::read_write> a_f,
- DeviceAccessor<float3, mode::read> a_shiftVec,
+ DeviceAccessor<Float3, mode::read> a_shiftVec,
DeviceAccessor<float, mode::read_write> a_fShift,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
DeviceAccessor<nbnxn_excl_t, mode::read> a_plistExcl,
- OptionalAccessor<float2, mode::read, ljComb<vdwType>> a_ljComb,
+ OptionalAccessor<Float2, mode::read, ljComb<vdwType>> a_ljComb,
OptionalAccessor<int, mode::read, !ljComb<vdwType>> a_atomTypes,
OptionalAccessor<float, mode::read, !ljComb<vdwType>> a_nbfp,
OptionalAccessor<float, mode::read, ljEwald<vdwType>> a_nbfpComb,
}
// shmem buffer for i x+q pre-loading
- cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+ cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
// shmem buffer for force reduction
auto sm_ljCombI = [&]() {
if constexpr (props.vdwComb)
{
- return cl::sycl::accessor<float2, 2, mode::read_write, target::local>(
+ return cl::sycl::accessor<Float2, 2, mode::read_write, target::local>(
cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
}
else
// It would be better to use sg.get_group_range() here, but too much of the logic relies on this value anyway
const unsigned widx = tidx / subGroupSize;
- float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
+ Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
- fCiBuf[i] = float3(0.0F, 0.0F, 0.0F);
+ fCiBuf[i] = Float3(0.0F, 0.0F, 0.0F);
}
const nbnxn_sci_t nbSci = a_plistSci[bidx];
const int ai = ci * c_clSize + tidxi;
const cl::sycl::id<2> cacheIdx = cl::sycl::id<2>(tidxj + i, tidxi);
- const float3 shift = a_shiftVec[nbSci.shift];
- float4 xqi = a_xq[ai];
- xqi += float4(shift[0], shift[1], shift[2], 0.0F);
+ const Float3 shift = a_shiftVec[nbSci.shift];
+ Float4 xqi = a_xq[ai];
+ xqi += Float4(shift[0], shift[1], shift[2], 0.0F);
xqi[3] *= epsFac;
sm_xq[cacheIdx] = xqi;
const int aj = cj * c_clSize + tidxj;
// load j atom data
- const float4 xqj = a_xq[aj];
+ const Float4 xqj = a_xq[aj];
- const float3 xj(xqj[0], xqj[1], xqj[2]);
+ const Float3 xj(xqj[0], xqj[1], xqj[2]);
const float qj = xqj[3];
int atomTypeJ; // Only needed if (!props.vdwComb)
- float2 ljCombJ; // Only needed if (props.vdwComb)
+ Float2 ljCombJ; // Only needed if (props.vdwComb)
if constexpr (props.vdwComb)
{
ljCombJ = a_ljComb[aj];
}
else
{
atomTypeJ = a_atomTypes[aj];
}
- float3 fCjBuf(0.0F, 0.0F, 0.0F);
+ Float3 fCjBuf(0.0F, 0.0F, 0.0F);
for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
// i cluster index
const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i;
// all threads load an atom from i cluster ci into shmem!
- const float4 xqi = sm_xq[i][tidxi];
- const float3 xi(xqi[0], xqi[1], xqi[2]);
+ const Float4 xqi = sm_xq[i][tidxi];
+ const Float3 xi(xqi[0], xqi[1], xqi[2]);
// distance between i and j atoms
- const float3 rv = xi - xj;
+ const Float3 rv = xi - xj;
float r2 = norm2(rv);
if constexpr (doPruneNBL)
}
else
{
- const float2 ljCombI = sm_ljCombI[i][tidxi];
+ const Float2 ljCombI = sm_ljCombI[i][tidxi];
if constexpr (props.vdwCombGeom)
{
c6 = ljCombI[0] * ljCombJ[0];
}
}
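This plain product is all the geometric combination rule needs, assuming (as the layout implies) that a_ljComb stores the square roots of the per-atom parameters: with ljComb[0] = sqrt(C6_i), the pair value becomes sqrt(C6_i) * sqrt(C6_j) = sqrt(C6_i * C6_j) = C6_ij, and likewise for C12 from ljComb[1].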
- const float3 forceIJ = rv * fInvR;
+ const Float3 forceIJ = rv * fInvR;
/* accumulate j forces in registers */
fCjBuf -= forceIJ;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
// Casting to float simplifies using atomic ops in the kernel
- cl::sycl::buffer<float3, 1> f(*adat->f.buffer_);
+ cl::sycl::buffer<Float3, 1> f(*adat->f.buffer_);
auto fAsFloat = f.reinterpret<float, 1>(f.get_count() * DIM);
- cl::sycl::buffer<float3, 1> fShift(*adat->fShift.buffer_);
+ cl::sycl::buffer<Float3, 1> fShift(*adat->fShift.buffer_);
auto fShiftAsFloat = fShift.reinterpret<float, 1>(fShift.get_count() * DIM);
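Reinterpreting the Float3 buffers as flat float arrays lets the kernel update each Cartesian component with a plain scalar atomic instead of needing atomics on a 3-wide type. Purely as a sketch of what such a per-component update can look like on the device side (the helper name is made up and a SYCL 2020 style sycl::atomic_ref is assumed; the kernel in this file may use a compatibility wrapper instead):

// Hypothetical helper, not part of this patch: accumulate a Float3 into the
// flat float force array (3 consecutive floats per atom, as produced by the
// reinterpret above) using scalar atomic adds.
template<class FloatAccessor>
static inline void atomicAddForce(FloatAccessor a_f, const int atomIndex, const Float3& f)
{
    for (int d = 0; d < 3; d++) // 3 == DIM
    {
        sycl::atomic_ref<float,
                         sycl::memory_order::relaxed,
                         sycl::memory_scope::device,
                         sycl::access::address_space::global_space>
                fRef(a_f[3 * atomIndex + d]);
        fRef.fetch_add(f[d]);
    }
}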
cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL,
*/
template<bool haveFreshList>
auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh,
- DeviceAccessor<float4, mode::read> a_xq,
- DeviceAccessor<float3, mode::read> a_shiftVec,
+ DeviceAccessor<Float4, mode::read> a_xq,
+ DeviceAccessor<Float3, mode::read> a_shiftVec,
DeviceAccessor<nbnxn_cj4_t, mode::read_write> a_plistCJ4,
DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
DeviceAccessor<unsigned int, haveFreshList ? mode::write : mode::read> a_plistIMask,
cgh.require(a_plistIMask);
/* shmem buffer for i x+q pre-loading */
- cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+ cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
constexpr int warpSize = c_clSize * c_clSize / 2;
/* We don't need q, but loading the full 4-wide value into shmem avoids bank conflicts
(at the cost of some extra L2 bandwidth). */
- const float4 xq = a_xq[ai];
- const float3 shift = a_shiftVec[nbSci.shift];
- const float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
+ const Float4 xq = a_xq[ai];
+ const Float3 shift = a_shiftVec[nbSci.shift];
+ const Float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
sm_xq[tidxj + i][tidxi] = xi;
}
}
const int aj = cj * c_clSize + tidxj;
/* load j atom data */
- const float4 tmp = a_xq[aj];
- const float3 xj(tmp[0], tmp[1], tmp[2]);
+ const Float4 tmp = a_xq[aj];
+ const Float3 xj(tmp[0], tmp[1], tmp[2]);
for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
{
if (imaskCheck & mask_ji)
{
// load i-cluster coordinates from shmem
- const float4 xi = sm_xq[i][tidxi];
+ const Float4 xi = sm_xq[i][tidxi];
// distance between i and j atoms
- float3 rv(xi[0], xi[1], xi[2]);
+ Float3 rv(xi[0], xi[1], xi[2]);
rv -= xj;
const float r2 = norm2(rv);