From: Artem Zhmurov Date: Mon, 22 Feb 2021 09:43:43 +0000 (+0000) Subject: Add FloatN aliases to OpenCL and use them in NBNXM X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=c66827166fc9099ecd1a4a2f7080558df70bf529;p=alexxy%2Fgromacs.git Add FloatN aliases to OpenCL and use them in NBNXM These aliases are necessary to unify OpenCL, CUDA and SYCL code. Refs #3312, #2608, #3311 --- diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h index 489bb0527c..a8a3c26818 100644 --- a/src/gromacs/gpu_utils/gputraits_ocl.h +++ b/src/gromacs/gpu_utils/gputraits_ocl.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -46,12 +46,22 @@ */ #include "gromacs/gpu_utils/gmxopencl.h" +#include "gromacs/math/vectypes.h" using DeviceTexture = void*; //! \brief Single GPU call timing event using CommandEvent = cl_event; +//! Convenience alias for 2-wide float +using Float2 = cl_float2; + +//! Convenience alias for 3-wide float. Not using cl_float3 due to alignment issues. +using Float3 = gmx::RVec; + +//! Convenience alias for 4-wide float. +using Float4 = cl_float4; + /*! \internal \brief * GPU kernels scheduling description. This is same in OpenCL/CUDA. 
* Provides reasonable defaults, one typically only needs to set the GPU stream diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 7e4eeca484..6cdad01019 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -577,12 +577,12 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom } /* HtoD x, q */ - GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()), - "The size of the xyzq buffer element should be equal to the size of float4."); + static_assert(sizeof(float) == sizeof(*nbatom->x().data()), + "The size of the xyzq buffer element should be equal to the size of float4."); copyToDeviceBuffer(&adat->xq, - nbatom->x().data() + adat_begin * 4, - adat_begin * 4, - adat_len * 4, + reinterpret_cast(nbatom->x().data()) + adat_begin, + adat_begin, + adat_len, deviceStream, GpuApiCallBehavior::Async, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr); @@ -984,10 +984,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH f */ GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float), "The host force buffer should be in single precision to match device data size."); - copyFromDeviceBuffer(&nbatom->out[0].f[adat_begin * DIM], + copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + adat_begin, &adat->f, - adat_begin * DIM, - adat_len * DIM, + adat_begin, + adat_len, deviceStream, GpuApiCallBehavior::Async, bDoTime ? 
t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); @@ -1012,12 +1012,13 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH fshift when virial is needed */ if (stepWork.computeVirial) { - GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float), - "Sizes of host- and device-side shift vector elements should be the same."); - copyFromDeviceBuffer(reinterpret_cast(nb->nbst.fshift), + static_assert( + sizeof(*nb->nbst.fshift) == sizeof(Float3), + "Sizes of host- and device-side shift vector elements should be the same."); + copyFromDeviceBuffer(nb->nbst.fshift, &adat->fshift, 0, - SHIFTS * DIM, + SHIFTS, deviceStream, GpuApiCallBehavior::Async, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); @@ -1026,8 +1027,8 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH energies */ if (stepWork.computeEnergy) { - GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float), - "Sizes of host- and device-side LJ energy terms should be the same."); + static_assert(sizeof(*nb->nbst.e_lj) == sizeof(float), + "Sizes of host- and device-side LJ energy terms should be the same."); copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, @@ -1035,9 +1036,9 @@ void gpu_launch_cpyback(NbnxmGpu* nb, deviceStream, GpuApiCallBehavior::Async, bDoTime ? 
t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); - GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float), - "Sizes of host- and device-side electrostatic energy terms should be the " - "same."); + static_assert(sizeof(*nb->nbst.e_el) == sizeof(float), + "Sizes of host- and device-side electrostatic energy terms should be the " + "same."); copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index 7472b37c70..706c3a48d5 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -375,7 +375,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear) cl_atomdata_t* atomData = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - clearDeviceBufferAsync(&atomData->f, 0, natoms_clear * DIM, localStream); + clearDeviceBufferAsync(&atomData->f, 0, natoms_clear, localStream); } //! This function is documented in the header file @@ -404,12 +404,12 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) /* only if we have a dynamic box */ if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) { - GMX_ASSERT(sizeof(float) * DIM == sizeof(*nbatom->shift_vec.data()), - "Sizes of host- and device-side shift vectors should be the same."); + static_assert(sizeof(Float3) == sizeof(nbatom->shift_vec[0]), + "Sizes of host- and device-side shift vectors should be the same."); copyToDeviceBuffer(&adat->shift_vec, - reinterpret_cast(nbatom->shift_vec.data()), + reinterpret_cast(nbatom->shift_vec.data()), 0, - SHIFTS * DIM, + SHIFTS, localStream, GpuApiCallBehavior::Async, nullptr); @@ -454,13 +454,13 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) } - allocateDeviceBuffer(&d_atdat->f, nalloc * DIM, deviceContext); - allocateDeviceBuffer(&d_atdat->xq, nalloc * (DIM + 1), deviceContext); + allocateDeviceBuffer(&d_atdat->f, nalloc, 
deviceContext); + allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext); if (useLjCombRule(nb->nbparam->vdwType)) { // Two Lennard-Jones parameters per atom - allocateDeviceBuffer(&d_atdat->lj_comb, nalloc * 2, deviceContext); + allocateDeviceBuffer(&d_atdat->lj_comb, nalloc, deviceContext); } else { @@ -482,20 +482,20 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwType)) { - GMX_ASSERT(sizeof(float) == sizeof(*nbat->params().lj_comb.data()), - "Size of the LJ parameters element should be equal to the size of float2."); + static_assert(sizeof(float) == sizeof(*nbat->params().lj_comb.data()), + "Size of the LJ parameters element should be equal to the size of float2."); copyToDeviceBuffer(&d_atdat->lj_comb, - nbat->params().lj_comb.data(), + reinterpret_cast(nbat->params().lj_comb.data()), 0, - 2 * natoms, + natoms, localStream, GpuApiCallBehavior::Async, bDoTime ? timers->atdat.fetchNextEvent() : nullptr); } else { - GMX_ASSERT(sizeof(int) == sizeof(*nbat->params().type.data()), - "Sizes of host- and device-side atom types should be the same."); + static_assert(sizeof(int) == sizeof(*nbat->params().type.data()), + "Sizes of host- and device-side atom types should be the same."); copyToDeviceBuffer(&d_atdat->atom_types, nbat->params().type.data(), 0, diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index 013345b32c..751e352962 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -87,7 +87,7 @@ struct nb_staging_t //! electrostatic energy float* e_el = nullptr; //! float3 buffer with shift forces - float (*fshift)[3] = nullptr; + Float3* fshift = nullptr; }; /*! \internal @@ -103,10 +103,10 @@ typedef struct cl_atomdata int nalloc; //! float4 buffer with atom coordinates + charges, size natoms - DeviceBuffer xq; + DeviceBuffer xq; //! 
float3 buffer with force output array, size natoms - DeviceBuffer f; + DeviceBuffer f; //! LJ energy output, size 1 DeviceBuffer e_lj; @@ -114,17 +114,17 @@ typedef struct cl_atomdata DeviceBuffer e_el; //! float3 buffer with shift forces - DeviceBuffer fshift; + DeviceBuffer fshift; //! number of atom types int ntypes; //! int buffer with atom type indices, size natoms DeviceBuffer atom_types; //! float2 buffer with sqrt(c6),sqrt(c12), size natoms - DeviceBuffer lj_comb; + DeviceBuffer lj_comb; //! float3 buffer with shifts values - DeviceBuffer shift_vec; + DeviceBuffer shift_vec; //! true if the shift vector has been uploaded bool bShiftVecUploaded;