Rename our SYCL aliases floatN to FloatN
authorAndrey Alekseenko <al42and@gmail.com>
Thu, 18 Feb 2021 10:09:14 +0000 (13:09 +0300)
committerJoe Jordan <ejjordan12@gmail.com>
Mon, 22 Feb 2021 07:49:54 +0000 (07:49 +0000)
Main reasons:

- Clearly separate our types (or aliases) from native types. This will
  likely make later changes (e.g., in scope of #3312) easier.
- Enable hipSYCL build by avoiding clashes of multiple floatN in the
  global namespace.

Refs #3312, #3923

src/gromacs/gpu_utils/devicebuffer_sycl.h
src/gromacs/gpu_utils/gputraits_sycl.h
src/gromacs/mdlib/leapfrog_gpu.h
src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h

index 31898e00eb95ae847cc2ba0b27aa87b87af8da63..222f08c20b7b770376d21db0e679e3e594a5e549 100644 (file)
@@ -383,7 +383,7 @@ cl::sycl::event fillSyclBufferWithNull(cl::sycl::buffer<ValueType, 1>& buffer,
 
 //! \brief Helper function to clear device buffer of type float3.
 template<>
-inline cl::sycl::event fillSyclBufferWithNull(cl::sycl::buffer<float3, 1>& buffer,
+inline cl::sycl::event fillSyclBufferWithNull(cl::sycl::buffer<Float3, 1>& buffer,
                                               size_t                       startingOffset,
                                               size_t                       numValues,
                                               cl::sycl::queue              queue)
index d53552e3c2e4849d1b2fb8ea738d7274ad8cf90e..9c64d3f303ba2b875c7708c5fca4d7e1952cf874 100644 (file)
@@ -54,14 +54,13 @@ using DeviceTexture = void*;
 //! \brief Single GPU call timing event, not used with SYCL
 using CommandEvent = void*;
 
+// TODO: Issue #3312
 //! Convenience alias.
-using float4 = cl::sycl::float4;
-
+using Float4 = cl::sycl::float4;
 //! Convenience alias. Not using cl::sycl::float3 due to alignment issues.
-using float3 = gmx::RVec;
-
+using Float3 = gmx::RVec;
 //! Convenience alias for cl::sycl::float2
-using float2 = cl::sycl::float2;
+using Float2 = cl::sycl::float2;
 
 /*! \internal \brief
  * GPU kernels scheduling description. This is same in OpenCL/CUDA.
index 108259f1c677e1804c7eddd6db4f19a4f6c1826b..d7c77ff756f2ec8150e906a0e6ebe1233e31df28 100644 (file)
@@ -53,6 +53,7 @@
 #if GMX_GPU_SYCL
 #    include "gromacs/gpu_utils/devicebuffer_sycl.h"
 #    include "gromacs/gpu_utils/gputraits_sycl.h"
+using float3 = Float3;
 #endif
 
 #include <memory>
index 1e8e0f34b3825d7a7b3218623c2234e2866be9a5..c313deb1106215f7db59f6b1618240231ebf1fbb 100644 (file)
@@ -111,9 +111,9 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
      */
     if (!stepWork.useGpuFBufferOps)
     {
-        GMX_ASSERT(adat->f.elementSize() == sizeof(float3),
+        GMX_ASSERT(adat->f.elementSize() == sizeof(Float3),
                    "The size of the force buffer element should be equal to the size of float3.");
-        copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adatBegin,
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + adatBegin,
                              &adat->f,
                              adatBegin,
                              adatLen,
@@ -197,10 +197,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
 
     /* HtoD x, q */
-    GMX_ASSERT(adat->xq.elementSize() == sizeof(float4),
+    GMX_ASSERT(adat->xq.elementSize() == sizeof(Float4),
                "The size of the xyzq buffer element should be equal to the size of float4.");
     copyToDeviceBuffer(&adat->xq,
-                       reinterpret_cast<const float4*>(nbatom->x().data()) + adatBegin,
+                       reinterpret_cast<const Float4*>(nbatom->x().data()) + adatBegin,
                        adatBegin,
                        adatLen,
                        deviceStream,
index ec538a1fc3e3c83d18ca60c1089b897e694c28fe..a3d8626fc431da1b64d3dd4deebe1cbbc64a63e2 100644 (file)
@@ -204,7 +204,7 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
         GMX_ASSERT(adat->shiftVec.elementSize() == sizeof(nbatom->shift_vec[0]),
                    "Sizes of host- and device-side shift vectors should be the same.");
         copyToDeviceBuffer(&adat->shiftVec,
-                           reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
+                           reinterpret_cast<const Float3*>(nbatom->shift_vec.data()),
                            0,
                            SHIFTS,
                            localStream,
@@ -262,10 +262,10 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
     if (useLjCombRule(nb->nbparam->vdwType))
     {
-        GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(float2),
+        GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(Float2),
                    "Size of the LJ parameters element should be equal to the size of float2.");
         copyToDeviceBuffer(&atdat->ljComb,
-                           reinterpret_cast<const float2*>(nbat->params().lj_comb.data()),
+                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                            0,
                            numAtoms,
                            localStream,
index a68c9d8b2f026346f881ff941e3da9cfea4bdc5b..946eb2dd4d955b9680908ddaed0f2a8d018841a7 100644 (file)
@@ -301,7 +301,7 @@ static inline float interpolateCoulombForceR(const DeviceAccessor<float, mode::r
     return lerp(left, right, fraction); // TODO: cl::sycl::mix
 }
 
-static inline void reduceForceJShuffle(float3                                  f,
+static inline void reduceForceJShuffle(Float3                                  f,
                                        const cl::sycl::nd_item<1>              itemIdx,
                                        const int                               tidxi,
                                        const int                               aidx,
@@ -342,7 +342,7 @@ static inline void reduceForceJShuffle(float3                                  f
  * This implementation works only with power of two array sizes.
  */
 static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
-                                         const float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
+                                         const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
                                          const bool   calcFShift,
                                          const cl::sycl::nd_item<1>              itemIdx,
                                          const int                               tidxi,
@@ -418,16 +418,16 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read
  */
 template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
 auto nbnxmKernel(cl::sycl::handler&                                        cgh,
-                 DeviceAccessor<float4, mode::read>                        a_xq,
+                 DeviceAccessor<Float4, mode::read>                        a_xq,
                  DeviceAccessor<float, mode::read_write>                   a_f,
-                 DeviceAccessor<float3, mode::read>                        a_shiftVec,
+                 DeviceAccessor<Float3, mode::read>                        a_shiftVec,
                  DeviceAccessor<float, mode::read_write>                   a_fShift,
                  OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
                  OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
                  DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
                  DeviceAccessor<nbnxn_sci_t, mode::read>                                 a_plistSci,
                  DeviceAccessor<nbnxn_excl_t, mode::read>                    a_plistExcl,
-                 OptionalAccessor<float2, mode::read, ljComb<vdwType>>       a_ljComb,
+                 OptionalAccessor<Float2, mode::read, ljComb<vdwType>>       a_ljComb,
                  OptionalAccessor<int, mode::read, !ljComb<vdwType>>         a_atomTypes,
                  OptionalAccessor<float, mode::read, !ljComb<vdwType>>       a_nbfp,
                  OptionalAccessor<float, mode::read, ljEwald<vdwType>>       a_nbfpComb,
@@ -483,7 +483,7 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
     }
 
     // shmem buffer for i x+q pre-loading
-    cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+    cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
             cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
 
     // shmem buffer for force reduction
@@ -506,7 +506,7 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
     auto sm_ljCombI = [&]() {
         if constexpr (props.vdwComb)
         {
-            return cl::sycl::accessor<float2, 2, mode::read_write, target::local>(
+            return cl::sycl::accessor<Float2, 2, mode::read_write, target::local>(
                     cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
         }
         else
@@ -539,10 +539,10 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
         // Better use sg.get_group_range, but too much of the logic relies on it anyway
         const unsigned widx = tidx / subGroupSize;
 
-        float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
+        Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
         for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
         {
-            fCiBuf[i] = float3(0.0F, 0.0F, 0.0F);
+            fCiBuf[i] = Float3(0.0F, 0.0F, 0.0F);
         }
 
         const nbnxn_sci_t nbSci     = a_plistSci[bidx];
@@ -561,9 +561,9 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
             const int             ai       = ci * c_clSize + tidxi;
             const cl::sycl::id<2> cacheIdx = cl::sycl::id<2>(tidxj + i, tidxi);
 
-            const float3 shift = a_shiftVec[nbSci.shift];
-            float4       xqi   = a_xq[ai];
-            xqi += float4(shift[0], shift[1], shift[2], 0.0F);
+            const Float3 shift = a_shiftVec[nbSci.shift];
+            Float4       xqi   = a_xq[ai];
+            xqi += Float4(shift[0], shift[1], shift[2], 0.0F);
             xqi[3] *= epsFac;
             sm_xq[cacheIdx] = xqi;
 
@@ -659,12 +659,12 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
                 const int aj     = cj * c_clSize + tidxj;
 
                 // load j atom data
-                const float4 xqj = a_xq[aj];
+                const Float4 xqj = a_xq[aj];
 
-                const float3 xj(xqj[0], xqj[1], xqj[2]);
+                const Float3 xj(xqj[0], xqj[1], xqj[2]);
                 const float  qj = xqj[3];
                 int          atomTypeJ; // Only needed if (!props.vdwComb)
-                float2       ljCombJ;   // Only needed if (props.vdwComb)
+                Float2       ljCombJ;   // Only needed if (props.vdwComb)
                 if constexpr (props.vdwComb)
                 {
                     ljCombJ = a_ljComb[aj];
@@ -674,7 +674,7 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
                     atomTypeJ = a_atomTypes[aj];
                 }
 
-                float3 fCjBuf(0.0F, 0.0F, 0.0F);
+                Float3 fCjBuf(0.0F, 0.0F, 0.0F);
 
                 for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                 {
@@ -683,11 +683,11 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
                         // i cluster index
                         const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i;
                         // all threads load an atom from i cluster ci into shmem!
-                        const float4 xqi = sm_xq[i][tidxi];
-                        const float3 xi(xqi[0], xqi[1], xqi[2]);
+                        const Float4 xqi = sm_xq[i][tidxi];
+                        const Float3 xi(xqi[0], xqi[1], xqi[2]);
 
                         // distance between i and j atoms
-                        const float3 rv = xi - xj;
+                        const Float3 rv = xi - xj;
                         float        r2 = norm2(rv);
 
                         if constexpr (doPruneNBL)
@@ -724,7 +724,7 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
                             }
                             else
                             {
-                                const float2 ljCombI = sm_ljCombI[i][tidxi];
+                                const Float2 ljCombI = sm_ljCombI[i][tidxi];
                                 if constexpr (props.vdwCombGeom)
                                 {
                                     c6  = ljCombI[0] * ljCombJ[0];
@@ -867,7 +867,7 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
                                 }
                             }
 
-                            const float3 forceIJ = rv * fInvR;
+                            const Float3 forceIJ = rv * fInvR;
 
                             /* accumulate j forces in registers */
                             fCjBuf -= forceIJ;
@@ -969,9 +969,9 @@ void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     // Casting to float simplifies using atomic ops in the kernel
-    cl::sycl::buffer<float3, 1> f(*adat->f.buffer_);
+    cl::sycl::buffer<Float3, 1> f(*adat->f.buffer_);
     auto                        fAsFloat = f.reinterpret<float, 1>(f.get_count() * DIM);
-    cl::sycl::buffer<float3, 1> fShift(*adat->fShift.buffer_);
+    cl::sycl::buffer<Float3, 1> fShift(*adat->fShift.buffer_);
     auto fShiftAsFloat = fShift.reinterpret<float, 1>(fShift.get_count() * DIM);
 
     cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL,
index cb88eb31d6fb3a7d267ea800b1a57755d98894ef..cf9ce2f67d45d0fb602b002fdc046718456b4ed9 100644 (file)
@@ -62,8 +62,8 @@ namespace Nbnxm
  */
 template<bool haveFreshList>
 auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
-                          DeviceAccessor<float4, mode::read>            a_xq,
-                          DeviceAccessor<float3, mode::read>            a_shiftVec,
+                          DeviceAccessor<Float4, mode::read>            a_xq,
+                          DeviceAccessor<Float3, mode::read>            a_shiftVec,
                           DeviceAccessor<nbnxn_cj4_t, mode::read_write> a_plistCJ4,
                           DeviceAccessor<nbnxn_sci_t, mode::read>       a_plistSci,
                           DeviceAccessor<unsigned int, haveFreshList ? mode::write : mode::read> a_plistIMask,
@@ -79,7 +79,7 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
     cgh.require(a_plistIMask);
 
     /* shmem buffer for i x+q pre-loading */
-    cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+    cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
             cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
 
     constexpr int warpSize = c_clSize * c_clSize / 2;
@@ -125,9 +125,9 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
 
                 /* We don't need q, but using float4 in shmem avoids bank conflicts.
                    (but it also wastes L2 bandwidth). */
-                const float4 xq    = a_xq[ai];
-                const float3 shift = a_shiftVec[nbSci.shift];
-                const float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
+                const Float4 xq    = a_xq[ai];
+                const Float3 shift = a_shiftVec[nbSci.shift];
+                const Float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
                 sm_xq[tidxj + i][tidxi] = xi;
             }
         }
@@ -170,17 +170,17 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
                         const int aj = cj * c_clSize + tidxj;
 
                         /* load j atom data */
-                        const float4 tmp = a_xq[aj];
-                        const float3 xj(tmp[0], tmp[1], tmp[2]);
+                        const Float4 tmp = a_xq[aj];
+                        const Float3 xj(tmp[0], tmp[1], tmp[2]);
 
                         for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                         {
                             if (imaskCheck & mask_ji)
                             {
                                 // load i-cluster coordinates from shmem
-                                const float4 xi = sm_xq[i][tidxi];
+                                const Float4 xi = sm_xq[i][tidxi];
                                 // distance between i and j atoms
-                                float3 rv(xi[0], xi[1], xi[2]);
+                                Float3 rv(xi[0], xi[1], xi[2]);
                                 rv -= xj;
                                 const float r2 = norm2(rv);
 
index 4e5e328f5658118a1803d9525f4280a5b93e012b..ba14d9b86740813616545e01bdbfdc92c626e4c6 100644 (file)
@@ -68,7 +68,7 @@ struct nb_staging_t
     //! electrostatic energy
     float* e_el = nullptr;
     //! shift forces
-    float3* fshift = nullptr;
+    Float3* fshift = nullptr;
 };
 
 /** \internal
@@ -84,9 +84,9 @@ struct sycl_atomdata_t
     int numAlloc;
 
     //! atom coordinates + charges, size \ref natoms
-    DeviceBuffer<float4> xq;
+    DeviceBuffer<Float4> xq;
     //! force output array, size \ref natoms
-    DeviceBuffer<float3> f;
+    DeviceBuffer<Float3> f;
 
     //! LJ energy output, size 1
     DeviceBuffer<float> eLJ;
@@ -94,17 +94,17 @@ struct sycl_atomdata_t
     DeviceBuffer<float> eElec;
 
     //! shift forces
-    DeviceBuffer<float3> fShift;
+    DeviceBuffer<Float3> fShift;
 
     //! number of atom types
     int numTypes;
     //! atom type indices, size \ref natoms
     DeviceBuffer<int> atomTypes;
     //! sqrt(c6),sqrt(c12) size \ref natoms
-    DeviceBuffer<float2> ljComb;
+    DeviceBuffer<Float2> ljComb;
 
     //! shifts
-    DeviceBuffer<float3> shiftVec;
+    DeviceBuffer<Float3> shiftVec;
     //! true if the shift vector has been uploaded
     bool shiftVecUploaded;
 };