From: Artem Zhmurov Date: Sat, 20 Feb 2021 08:12:02 +0000 (+0300) Subject: Add FloatN aliases to CUDA and use them in NBNXM X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=a54b3a0d565000f519775d7d23f23bb07ce5bd7f;p=alexxy%2Fgromacs.git Add FloatN aliases to CUDA and use them in NBNXM These aliases are nessesary to unify OpenCL, CUDA and SYCL code. Refs #3312, #2608, #3311 --- diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh index fec113b4b4..656e8a2319 100644 --- a/src/gromacs/gpu_utils/gputraits.cuh +++ b/src/gromacs/gpu_utils/gputraits.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -45,6 +45,7 @@ * \ingroup module_gpu_utils */ #include +#include "gromacs/math/vectypes.h" //! Device texture for fast read-only data fetching using DeviceTexture = cudaTextureObject_t; @@ -52,6 +53,15 @@ using DeviceTexture = cudaTextureObject_t; //! \brief Single GPU call timing event - meaningless in CUDA using CommandEvent = void; +//! Convenience alias for 2-wide float +using Float2 = float2; + +//! Convenience alias for 3-wide float +using Float3 = gmx::RVec; + +//! Convenience alias for 4-wide float. +using Float4 = float4; + /*! \internal \brief * GPU kernels scheduling description. This is same in OpenCL/CUDA. * Provides reasonable defaults, one typically only needs to set the GPU stream diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 6796da5aac..62d50f039d 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -496,17 +496,17 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom adat_len = adat->natoms - adat->natoms_local; } - /* HtoD x, q */ /* beginning of timed HtoD section */ if (bDoTime) { t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream); } - static_assert(sizeof(adat->xq[0]) == sizeof(float4), + /* HtoD x, q */ + static_assert(sizeof(adat->xq[0]) == sizeof(Float4), "The size of the xyzq buffer element should be equal to the size of float4."); copyToDeviceBuffer(&adat->xq, - reinterpret_cast(nbatom->x().data()) + adat_begin, + reinterpret_cast(nbatom->x().data()) + adat_begin, adat_begin, adat_len, deviceStream, @@ -845,9 +845,9 @@ void gpu_launch_cpyback(NbnxmGpu* nb, if (!stepWork.useGpuFBufferOps) { static_assert( - sizeof(adat->f[0]) == sizeof(float3), + sizeof(adat->f[0]) == sizeof(Float3), "The size of the force buffer element should be equal to the size of float3."); - copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + adat_begin, + copyFromDeviceBuffer(reinterpret_cast(nbatom->out[0].f.data()) + adat_begin, &adat->f, adat_begin, adat_len, diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index 7d1334144f..dff7e4d21c 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -268,7 +268,7 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) static_assert(sizeof(adat->shift_vec[0]) == sizeof(nbatom->shift_vec[0]), "Sizes of host- and device-side shift vectors should be the same."); copyToDeviceBuffer(&adat->shift_vec, - reinterpret_cast(nbatom->shift_vec.data()), + reinterpret_cast(nbatom->shift_vec.data()), 0, SHIFTS, localStream, @@ -368,10 +368,10 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwType)) { - static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(float2), + static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(Float2), "Size of the LJ parameters element should be equal to the size of float2."); copyToDeviceBuffer(&d_atdat->lj_comb, - reinterpret_cast(nbat->params().lj_comb.data()), + reinterpret_cast(nbat->params().lj_comb.data()), 0, natoms, localStream, diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh index 0ff57b25b1..688e094715 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh @@ -48,6 +48,7 @@ #include "gromacs/gpu_utils/cuda_arch_utils.cuh" #include "gromacs/gpu_utils/cuda_kernel_utils.cuh" +#include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/math/utilities.h" #include "gromacs/pbcutil/ishift.h" /* Note that floating-point constants in CUDA code should be suffixed @@ -178,8 +179,8 @@ __launch_bounds__(THREADS_PER_BLOCK) float2 ljcp_i, ljcp_j; # endif const float4* xq = atdat.xq; - float3* f = atdat.f; - const float3* shift_vec = atdat.shift_vec; + float3* f = asFloat3(atdat.f); + const float3* shift_vec = asFloat3(atdat.shift_vec); float rcoulomb_sq = nbparam.rcoulomb_sq; # ifdef VDW_CUTOFF_CHECK float rvdw_sq = nbparam.rvdw_sq; @@ -648,7 +649,8 @@ __launch_bounds__(THREADS_PER_BLOCK) /* add up local shift forces into global mem, tidxj indexes x,y,z */ if (bCalcFshift && (tidxj & 3) < 3) { - atomicAdd(&(atdat.fshift[nb_sci.shift].x) + (tidxj & 3), fshift_buf); + float3* fshift = asFloat3(atdat.fshift); + atomicAdd(&(fshift[nb_sci.shift].x) + (tidxj & 3), fshift_buf); } # ifdef CALC_ENERGIES diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh index 563e1edc0c..11c51227f4 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -47,6 +47,7 @@ #include "gmxpre.h" #include "gromacs/gpu_utils/cuda_arch_utils.cuh" +#include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/math/utilities.h" #include "gromacs/pbcutil/ishift.h" @@ -124,7 +125,7 @@ nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnx const nbnxn_sci_t* pl_sci = plist.sci; nbnxn_cj4_t* pl_cj4 = plist.cj4; const float4* xq = atdat.xq; - const float3* shift_vec = atdat.shift_vec; + const float3* shift_vec = asFloat3(atdat.shift_vec); float rlistOuter_sq = nbparam.rlistOuter_sq; float rlistInner_sq = nbparam.rlistInner_sq; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 65a247ad08..f1b1a6db81 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -84,7 +84,7 @@ struct nb_staging_t //! electrostatic energy float* e_el = nullptr; //! shift forces - float3* fshift = nullptr; + Float3* fshift = nullptr; }; /** \internal @@ -100,9 +100,9 @@ struct cu_atomdata int nalloc; //! atom coordinates + charges, size natoms - DeviceBuffer xq; + DeviceBuffer xq; //! force output array, size natoms - DeviceBuffer f; + DeviceBuffer f; //! LJ energy output, size 1 DeviceBuffer e_lj; @@ -110,17 +110,17 @@ struct cu_atomdata DeviceBuffer e_el; //! shift forces - DeviceBuffer fshift; + DeviceBuffer fshift; //! number of atom types int ntypes; //! atom type indices, size natoms DeviceBuffer atom_types; //! sqrt(c6),sqrt(c12) size natoms - DeviceBuffer lj_comb; + DeviceBuffer lj_comb; //! shifts - DeviceBuffer shift_vec; + DeviceBuffer shift_vec; //! true if the shift vector has been uploaded bool bShiftVecUploaded; };