From: Andrey Alekseenko
Date: Thu, 18 Feb 2021 10:09:14 +0000 (+0300)
Subject: Rename our SYCL aliases floatN to FloatN
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=c39bed54b02de1f1622bca5d404737186bdad427;p=alexxy%2Fgromacs.git

Rename our SYCL aliases floatN to FloatN

Main reasons:
- Clearly separate our own types (or aliases) from the native SYCL types.
  This will likely make later changes (e.g., in the scope of #3312) easier.
- Enable the hipSYCL build by avoiding clashes between multiple floatN
  types in the global namespace (illustrated in the sketch after the
  gputraits_sycl.h diff below).

Refs #3312, #3923
---

diff --git a/src/gromacs/gpu_utils/devicebuffer_sycl.h b/src/gromacs/gpu_utils/devicebuffer_sycl.h
index 31898e00eb..222f08c20b 100644
--- a/src/gromacs/gpu_utils/devicebuffer_sycl.h
+++ b/src/gromacs/gpu_utils/devicebuffer_sycl.h
@@ -383,7 +383,7 @@ cl::sycl::event fillSyclBufferWithNull(cl::sycl::buffer<ValueType>& buffer,
 
 //! \brief Helper function to clear device buffer of type float3.
 template<>
-inline cl::sycl::event fillSyclBufferWithNull(cl::sycl::buffer<float3>& buffer,
+inline cl::sycl::event fillSyclBufferWithNull(cl::sycl::buffer<Float3>& buffer,
                                               size_t          startingOffset,
                                               size_t          numValues,
                                               cl::sycl::queue queue)
diff --git a/src/gromacs/gpu_utils/gputraits_sycl.h b/src/gromacs/gpu_utils/gputraits_sycl.h
index d53552e3c2..9c64d3f303 100644
--- a/src/gromacs/gpu_utils/gputraits_sycl.h
+++ b/src/gromacs/gpu_utils/gputraits_sycl.h
@@ -54,14 +54,13 @@ using DeviceTexture = void*;
 //! \brief Single GPU call timing event, not used with SYCL
 using CommandEvent = void*;
 
+// TODO: Issue #3312
 //! Convenience alias.
-using float4 = cl::sycl::float4;
-
+using Float4 = cl::sycl::float4;
 //! Convenience alias. Not using cl::sycl::float3 due to alignment issues.
-using float3 = gmx::RVec;
-
+using Float3 = gmx::RVec;
 //! Convenience alias for cl::sycl::float2
-using float2 = cl::sycl::float2;
+using Float2 = cl::sycl::float2;
 
 /*! \internal \brief
  * GPU kernels scheduling description. This is same in OpenCL/CUDA.
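A minimal sketch of the global-namespace clash behind the second bullet (not
part of the patch; the stand-in definitions below are hypothetical and merely
mimic how hipSYCL's backend headers declare HIP/CUDA-style vector types in the
global namespace):

    // Stand-ins so the sketch is self-contained; in reality these come from
    // the SYCL implementation's headers.
    namespace cl::sycl { struct float4 { float x, y, z, w; }; } // e.g. <CL/sycl.hpp>
    struct float4 { float x, y, z, w; }; // HIP/CUDA-style vector type that
                                         // hipSYCL's backend headers place in
                                         // the global namespace

    // Old GROMACS alias (gputraits_sycl.h) conflicts with ::float4 above:
    // using float4 = cl::sycl::float4;  // error: redefinition of 'float4'

    // New alias: a distinct name, so both can coexist.
    using Float4 = cl::sycl::float4;
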
diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h
index 108259f1c6..d7c77ff756 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.h
+++ b/src/gromacs/mdlib/leapfrog_gpu.h
@@ -53,6 +53,7 @@
 #if GMX_GPU_SYCL
 #    include "gromacs/gpu_utils/devicebuffer_sycl.h"
 #    include "gromacs/gpu_utils/gputraits_sycl.h"
+using float3 = Float3;
 #endif
 
 #include
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
index 1e8e0f34b3..c313deb110 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
@@ -111,9 +111,9 @@ void gpu_launch_cpyback(NbnxmGpu* nb,
      */
     if (!stepWork.useGpuFBufferOps)
     {
-        GMX_ASSERT(adat->f.elementSize() == sizeof(float3),
+        GMX_ASSERT(adat->f.elementSize() == sizeof(Float3),
                    "The size of the force buffer element should be equal to the size of float3.");
-        copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adatBegin,
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + adatBegin,
                              &adat->f,
                              adatBegin,
                              adatLen,
@@ -197,10 +197,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
 
     /* HtoD x, q */
-    GMX_ASSERT(adat->xq.elementSize() == sizeof(float4),
+    GMX_ASSERT(adat->xq.elementSize() == sizeof(Float4),
                "The size of the xyzq buffer element should be equal to the size of float4.");
     copyToDeviceBuffer(&adat->xq,
-                       reinterpret_cast<const float4*>(nbatom->x().data()) + adatBegin,
+                       reinterpret_cast<const Float4*>(nbatom->x().data()) + adatBegin,
                        adatBegin,
                        adatLen,
                        deviceStream,
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
index ec538a1fc3..a3d8626fc4 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
@@ -204,7 +204,7 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
         GMX_ASSERT(adat->shiftVec.elementSize() == sizeof(nbatom->shift_vec[0]),
                    "Sizes of host- and device-side shift vectors should be the same.");
         copyToDeviceBuffer(&adat->shiftVec,
-                           reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
+                           reinterpret_cast<const Float3*>(nbatom->shift_vec.data()),
                            0,
                            SHIFTS,
                            localStream,
@@ -262,10 +262,10 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
     if (useLjCombRule(nb->nbparam->vdwType))
     {
-        GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(float2),
+        GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(Float2),
                    "Size of the LJ parameters element should be equal to the size of float2.");
         copyToDeviceBuffer(&atdat->ljComb,
-                           reinterpret_cast<const float2*>(nbat->params().lj_comb.data()),
+                           reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                            0,
                            numAtoms,
                            localStream,
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
index a68c9d8b2f..946eb2dd4d 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -301,7 +301,7 @@ static inline float interpolateCoulombForceR(const DeviceAccessor<float, mode::read> a_coulombTab,
-static inline void reduceForceJShuffle(float3                     f,
+static inline void reduceForceJShuffle(Float3                     f,
                                        const cl::sycl::nd_item<1> itemIdx,
                                        const int                  tidxi,
                                        const int                  aidx,
@@ -342,7 +342,7 @@ static inline void reduceForceJShuffle(float3 f,
  * This implementation works only with power of two array sizes.
  */
 static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
-                                         const float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
+                                         const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
                                          const bool                 calcFShift,
                                          const cl::sycl::nd_item<1> itemIdx,
                                          const int                  tidxi,
@@ -418,16 +418,16 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor
 
 template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
 auto nbnxmKernel(cl::sycl::handler&                                   cgh,
-                 DeviceAccessor<float4, mode::read>                   a_xq,
+                 DeviceAccessor<Float4, mode::read>                   a_xq,
                  DeviceAccessor<float, mode_atomic>                   a_f,
-                 DeviceAccessor<float3, mode::read>                   a_shiftVec,
+                 DeviceAccessor<Float3, mode::read>                   a_shiftVec,
                  DeviceAccessor<float, mode_atomic>                   a_fShift,
                  OptionalAccessor<float, mode_atomic, doCalcEnergies> a_energyElec,
                  OptionalAccessor<float, mode_atomic, doCalcEnergies> a_energyVdw,
                  DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
                  DeviceAccessor<nbnxn_sci_t, mode::read>              a_plistSci,
                  DeviceAccessor<nbnxn_excl_t, mode::read>             a_plistExcl,
-                 OptionalAccessor<float2, mode::read, ljComb<vdwType>> a_ljComb,
+                 OptionalAccessor<Float2, mode::read, ljComb<vdwType>> a_ljComb,
                  OptionalAccessor<int, mode::read, !ljComb<vdwType>>   a_atomTypes,
                  OptionalAccessor<float, mode::read, !ljComb<vdwType>> a_nbfp,
                  OptionalAccessor<float, mode::read, ljEwald<vdwType>> a_nbfpComb,
@@ -483,7 +483,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
     }
 
     // shmem buffer for i x+q pre-loading
-    cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+    cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
             cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
 
     // shmem buffer for force reduction
@@ -506,7 +506,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
     auto sm_ljCombI = [&]() {
         if constexpr (props.vdwComb)
         {
-            return cl::sycl::accessor<float2, 2, mode::read_write, target::local>(
+            return cl::sycl::accessor<Float2, 2, mode::read_write, target::local>(
                     cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
         }
         else
@@ -539,10 +539,10 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
         // Better use sg.get_group_range, but too much of the logic relies on it anyway
         const unsigned widx = tidx / subGroupSize;
 
-        float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
+        Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
         for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
         {
-            fCiBuf[i] = float3(0.0F, 0.0F, 0.0F);
+            fCiBuf[i] = Float3(0.0F, 0.0F, 0.0F);
         }
 
         const nbnxn_sci_t nbSci = a_plistSci[bidx];
@@ -561,9 +561,9 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
                 const int             ai       = ci * c_clSize + tidxi;
                 const cl::sycl::id<2> cacheIdx = cl::sycl::id<2>(tidxj + i, tidxi);
 
-                const float3 shift = a_shiftVec[nbSci.shift];
-                float4       xqi   = a_xq[ai];
-                xqi += float4(shift[0], shift[1], shift[2], 0.0F);
+                const Float3 shift = a_shiftVec[nbSci.shift];
+                Float4       xqi   = a_xq[ai];
+                xqi += Float4(shift[0], shift[1], shift[2], 0.0F);
                 xqi[3] *= epsFac;
                 sm_xq[cacheIdx] = xqi;
@@ -659,12 +659,12 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
                 const int aj = cj * c_clSize + tidxj;
 
                 // load j atom data
-                const float4 xqj = a_xq[aj];
+                const Float4 xqj = a_xq[aj];
 
-                const float3 xj(xqj[0], xqj[1], xqj[2]);
+                const Float3 xj(xqj[0], xqj[1], xqj[2]);
                 const float  qj = xqj[3];
                 int          atomTypeJ; // Only needed if (!props.vdwComb)
-                float2       ljCombJ;   // Only needed if (props.vdwComb)
+                Float2       ljCombJ;   // Only needed if (props.vdwComb)
                 if constexpr (props.vdwComb)
                 {
                     ljCombJ = a_ljComb[aj];
@@ -674,7 +674,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
                     atomTypeJ = a_atomTypes[aj];
                 }
 
-                float3 fCjBuf(0.0F, 0.0F, 0.0F);
+                Float3 fCjBuf(0.0F, 0.0F, 0.0F);
 
                 for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                 {
@@ -683,11 +683,11 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
                     // i cluster index
                     const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i;
                     // all threads load an atom from i cluster ci into shmem!
-                    const float4 xqi = sm_xq[i][tidxi];
-                    const float3 xi(xqi[0], xqi[1], xqi[2]);
+                    const Float4 xqi = sm_xq[i][tidxi];
+                    const Float3 xi(xqi[0], xqi[1], xqi[2]);
 
                     // distance between i and j atoms
-                    const float3 rv = xi - xj;
+                    const Float3 rv = xi - xj;
                     float        r2 = norm2(rv);
 
                     if constexpr (doPruneNBL)
@@ -724,7 +724,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
                         }
                         else
                         {
-                            const float2 ljCombI = sm_ljCombI[i][tidxi];
+                            const Float2 ljCombI = sm_ljCombI[i][tidxi];
                             if constexpr (props.vdwCombGeom)
                             {
                                 c6 = ljCombI[0] * ljCombJ[0];
@@ -867,7 +867,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh,
                             }
                         }
 
-                        const float3 forceIJ = rv * fInvR;
+                        const Float3 forceIJ = rv * fInvR;
 
                         /* accumulate j forces in registers */
                         fCjBuf -= forceIJ;
@@ -969,9 +969,9 @@ void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     // Casting to float simplifies using atomic ops in the kernel
-    cl::sycl::buffer<float3, 1> f(*adat->f.buffer_);
+    cl::sycl::buffer<Float3, 1> f(*adat->f.buffer_);
     auto fAsFloat = f.reinterpret<float, 1>(f.get_count() * DIM);
-    cl::sycl::buffer<float3, 1> fShift(*adat->fShift.buffer_);
+    cl::sycl::buffer<Float3, 1> fShift(*adat->fShift.buffer_);
     auto fShiftAsFloat = fShift.reinterpret<float, 1>(fShift.get_count() * DIM);
 
     cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL,
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
index cb88eb31d6..cf9ce2f67d 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
@@ -62,8 +62,8 @@ namespace Nbnxm
  */
 template<bool haveFreshList>
 auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
-                          DeviceAccessor<float4, mode::read>            a_xq,
-                          DeviceAccessor<float3, mode::read>            a_shiftVec,
+                          DeviceAccessor<Float4, mode::read>            a_xq,
+                          DeviceAccessor<Float3, mode::read>            a_shiftVec,
                           DeviceAccessor<nbnxn_cj4_t, mode::read_write> a_plistCJ4,
                           DeviceAccessor<nbnxn_sci_t, mode::read>      a_plistSci,
                           DeviceAccessor<unsigned int, haveFreshList ? mode::write : mode::read> a_plistIMask,
@@ -79,7 +79,7 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh,
     cgh.require(a_plistIMask);
 
     /* shmem buffer for i x+q pre-loading */
-    cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+    cl::sycl::accessor<Float4, 2, mode::read_write, target::local> sm_xq(
             cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
 
     constexpr int warpSize = c_clSize * c_clSize / 2;
@@ -125,9 +125,9 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh,
                 /* We don't need q, but using float4 in shmem avoids bank conflicts.
                    (but it also wastes L2 bandwidth).
                  */
-                const float4 xq    = a_xq[ai];
-                const float3 shift = a_shiftVec[nbSci.shift];
-                const float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
+                const Float4 xq    = a_xq[ai];
+                const Float3 shift = a_shiftVec[nbSci.shift];
+                const Float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
                 sm_xq[tidxj + i][tidxi] = xi;
             }
         }
@@ -170,17 +170,17 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh,
                     const int aj = cj * c_clSize + tidxj;
 
                     /* load j atom data */
-                    const float4 tmp = a_xq[aj];
-                    const float3 xj(tmp[0], tmp[1], tmp[2]);
+                    const Float4 tmp = a_xq[aj];
+                    const Float3 xj(tmp[0], tmp[1], tmp[2]);
 
                     for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
                     {
                         if (imaskCheck & mask_ji)
                         {
                             // load i-cluster coordinates from shmem
-                            const float4 xi = sm_xq[i][tidxi];
+                            const Float4 xi = sm_xq[i][tidxi];
                             // distance between i and j atoms
-                            float3 rv(xi[0], xi[1], xi[2]);
+                            Float3 rv(xi[0], xi[1], xi[2]);
                             rv -= xj;
                             const float r2 = norm2(rv);
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h
index 4e5e328f56..ba14d9b867 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h
@@ -68,7 +68,7 @@ struct nb_staging_t
     //! electrostatic energy
     float* e_el = nullptr;
     //! shift forces
-    float3* fshift = nullptr;
+    Float3* fshift = nullptr;
 };
 
 /** \internal
@@ -84,9 +84,9 @@ struct sycl_atomdata_t
     int numAlloc;
 
     //! atom coordinates + charges, size \ref natoms
-    DeviceBuffer<float4> xq;
+    DeviceBuffer<Float4> xq;
     //! force output array, size \ref natoms
-    DeviceBuffer<float3> f;
+    DeviceBuffer<Float3> f;
 
     //! LJ energy output, size 1
     DeviceBuffer<float> eLJ;
@@ -94,17 +94,17 @@ struct sycl_atomdata_t
     //! electrostatic energy output, size 1
     DeviceBuffer<float> eElec;
 
     //! shift forces
-    DeviceBuffer<float3> fShift;
+    DeviceBuffer<Float3> fShift;
 
     //! number of atom types
     int numTypes;
     //! atom type indices, size \ref natoms
     DeviceBuffer<int> atomTypes;
     //! sqrt(c6),sqrt(c12) size \ref natoms
-    DeviceBuffer<float2> ljComb;
+    DeviceBuffer<Float2> ljComb;
 
     //! shifts
-    DeviceBuffer<float3> shiftVec;
+    DeviceBuffer<Float3> shiftVec;
     //! true if the shift vector has been uploaded
     bool shiftVecUploaded;
 };