From 03a138de53c34f55fca2c3f998f0a85d520f3b0a Mon Sep 17 00:00:00 2001 From: Andrey Alekseenko Date: Thu, 11 Mar 2021 13:48:06 +0300 Subject: [PATCH] Use DeviceBuffer in GPU update and NBNXM code MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit ... instead of raw device pointers and DeviceBuffer. We try to use DeviceBuffer, but in some places we have to use DeviceVector, until we can define FloatN types without including any backend-specific headers. Currently, Float3 is defined as gmx::RVec, so this should not cause any issues. Also added some helper functions to convert RVec ̌<-> Float3 <-> float3. Preparation for #3932 and #3941. --- src/gromacs/gpu_utils/gputraits.h | 52 ++++++++++++++++++- src/gromacs/gpu_utils/typecasts.cuh | 23 +++++++- src/gromacs/mdlib/gpuforcereduction.h | 2 +- src/gromacs/mdlib/gpuforcereduction_impl.cpp | 2 +- src/gromacs/mdlib/gpuforcereduction_impl.cu | 4 +- src/gromacs/mdlib/gpuforcereduction_impl.cuh | 5 +- src/gromacs/mdlib/leapfrog_gpu.cu | 27 +++++----- src/gromacs/mdlib/leapfrog_gpu.h | 16 +++--- src/gromacs/mdlib/leapfrog_gpu_sycl.cpp | 34 ++++++------ src/gromacs/mdlib/lincs_gpu.cu | 28 ++++++---- src/gromacs/mdlib/lincs_gpu.cuh | 17 +++--- src/gromacs/mdlib/settle_gpu.cu | 32 ++++++++---- src/gromacs/mdlib/settle_gpu.cuh | 21 ++++---- src/gromacs/mdlib/tests/constrtestrunners.cu | 29 ++++++----- .../mdlib/tests/leapfrogtestrunners_gpu.cpp | 13 +++-- src/gromacs/mdlib/tests/settletestrunners.cu | 22 ++++---- .../mdlib/update_constrain_gpu_impl.cu | 18 +++---- src/gromacs/mdlib/update_constrain_gpu_impl.h | 16 +++--- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 2 +- src/gromacs/nbnxm/nbnxm.cpp | 2 +- src/gromacs/nbnxm/nbnxm.h | 2 +- src/gromacs/nbnxm/nbnxm_gpu.h | 3 +- 22 files changed, 230 insertions(+), 140 deletions(-) diff --git a/src/gromacs/gpu_utils/gputraits.h b/src/gromacs/gpu_utils/gputraits.h index 38c5edf8a9..344b0427c1 100644 --- a/src/gromacs/gpu_utils/gputraits.h +++ b/src/gromacs/gpu_utils/gputraits.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -66,6 +66,56 @@ using DeviceTexture = void*; //! \brief Single GPU call timing event using CommandEvent = void*; +// Stubs for CPU-only build. Might be changed in #3312. +struct Float2 +{ +}; +struct Float3 +{ +}; +struct Float4 +{ +}; + #endif // GMX_GPU +namespace gmx +{ +template +static inline Float3* asGenericFloat3Pointer(T* in) +{ + static_assert(sizeof(T) == sizeof(Float3), + "Size of the host-side data-type is different from the size of the generic " + "device-side counterpart."); + return reinterpret_cast(in); +} + +template +static inline const Float3* asGenericFloat3Pointer(const T* in) +{ + static_assert(sizeof(T) == sizeof(Float3), + "Size of the host-side data-type is different from the size of the generic " + "device-side counterpart."); + return reinterpret_cast(in); +} + +template +static inline Float3* asGenericFloat3Pointer(C& in) +{ + static_assert(sizeof(*in.data()) == sizeof(Float3), + "Size of the host-side data-type is different from the size of the device-side " + "counterpart."); + return reinterpret_cast(in.data()); +} + +template +static inline const Float3* asGenericFloat3Pointer(const C& in) +{ + static_assert(sizeof(*in.data()) == sizeof(Float3), + "Size of the host-side data-type is different from the size of the device-side " + "counterpart."); + return reinterpret_cast(in.data()); +} +} // namespace gmx + #endif // GMX_GPU_UTILS_GPUTRAITS_H diff --git a/src/gromacs/gpu_utils/typecasts.cuh b/src/gromacs/gpu_utils/typecasts.cuh index 1dd63b7193..d98c587703 100644 --- a/src/gromacs/gpu_utils/typecasts.cuh +++ b/src/gromacs/gpu_utils/typecasts.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -60,4 +60,25 @@ static inline __host__ __device__ float3* asFloat3(gmx::RVec* in) return reinterpret_cast(in); } +/*! \brief Cast pointer RVec buffer to a pointer to float3 buffer. + * + * \param[in] in The Pointer to RVec buffer to cast. + * + * \returns Buffer pointer, casted to float3*. + */ +static inline __host__ __device__ float3** asFloat3Pointer(gmx::RVec** in) +{ + static_assert(sizeof((*in)[0]) == sizeof(float3), + "Size of the host-side data-type is different from the size of the device-side " + "counterpart."); + return reinterpret_cast(in); +} +static inline __host__ __device__ const float3* const* asFloat3Pointer(const gmx::RVec* const* in) +{ + static_assert(sizeof((*in)[0]) == sizeof(float3), + "Size of the host-side data-type is different from the size of the device-side " + "counterpart."); + return reinterpret_cast(in); +} + #endif // GMX_GPU_UTILS_TYPECASTS_CUH diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h index 157c4c7eca..b23df660ed 100644 --- a/src/gromacs/mdlib/gpuforcereduction.h +++ b/src/gromacs/mdlib/gpuforcereduction.h @@ -86,7 +86,7 @@ public: * * \param [in] forcePtr Pointer to force to be reduced */ - void registerNbnxmForce(void* forcePtr); + void registerNbnxmForce(DeviceBuffer forcePtr); /*! \brief Register a rvec-format force to be reduced * diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp index 1e0a30b2b6..b431fbad49 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -76,7 +76,7 @@ void GpuForceReduction::reinit(DeviceBuffer /*baseForcePtr*/, } // NOLINTNEXTLINE readability-convert-member-functions-to-static -void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */) +void GpuForceReduction::registerNbnxmForce(DeviceBuffer /* forcePtr */) { GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); } diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu index 6e1e7e920a..dab7d4da0c 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cu +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu @@ -218,9 +218,9 @@ GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, { } -void GpuForceReduction::registerNbnxmForce(void* forcePtr) +void GpuForceReduction::registerNbnxmForce(DeviceBuffer forcePtr) { - impl_->registerNbnxmForce(reinterpret_cast>(forcePtr)); + impl_->registerNbnxmForce(forcePtr); } void GpuForceReduction::registerRvecForce(void* forcePtr) diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cuh b/src/gromacs/mdlib/gpuforcereduction_impl.cuh index bd222e40a6..c7d9493c82 100644 --- a/src/gromacs/mdlib/gpuforcereduction_impl.cuh +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cuh @@ -45,6 +45,7 @@ #include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gputraits.h" #include "gromacs/math/vectypes.h" #include "gpuforcereduction.h" @@ -82,13 +83,13 @@ public: * * \param [in] forcePtr Pointer to force to be reduced */ - void registerNbnxmForce(DeviceBuffer forcePtr); + void registerNbnxmForce(DeviceBuffer forcePtr); /*! \brief Register a rvec-format force to be reduced * * \param [in] forcePtr Pointer to force to be reduced */ - void registerRvecForce(DeviceBuffer forcePtr); + void registerRvecForce(DeviceBuffer forcePtr); /*! \brief Add a dependency for this force reduction * diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu index 2f5b589870..75b1026cfc 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.cu +++ b/src/gromacs/mdlib/leapfrog_gpu.cu @@ -57,6 +57,7 @@ #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/math/vec.h" #include "gromacs/mdtypes/group.h" @@ -237,10 +238,10 @@ inline auto selectLeapFrogKernelPtr(bool doTemperatureScaling, return kernelPtr; } -void LeapFrogGpu::integrate(const DeviceBuffer d_x, - DeviceBuffer d_xp, - DeviceBuffer d_v, - const DeviceBuffer d_f, +void LeapFrogGpu::integrate(DeviceBuffer d_x, + DeviceBuffer d_xp, + DeviceBuffer d_v, + const DeviceBuffer d_f, const real dt, const bool doTemperatureScaling, gmx::ArrayRef tcstat, @@ -283,20 +284,22 @@ void LeapFrogGpu::integrate(const DeviceBuffer d_x, "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported " "in GPU version of Leap-Frog integrator."); prVelocityScalingMatrixDiagonal_ = - make_float3(dtPressureCouple * prVelocityScalingMatrix[XX][XX], - dtPressureCouple * prVelocityScalingMatrix[YY][YY], - dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ]); + Float3{ dtPressureCouple * prVelocityScalingMatrix[XX][XX], + dtPressureCouple * prVelocityScalingMatrix[YY][YY], + dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ] }; } kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType); } + // Checking the buffer types against the kernel argument types + static_assert(sizeof(*d_inverseMasses_) == sizeof(float)); const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig_, &numAtoms_, - &d_x, - &d_xp, - &d_v, - &d_f, + asFloat3Pointer(&d_x), + asFloat3Pointer(&d_xp), + asFloat3Pointer(&d_v), + asFloat3Pointer(&d_f), &d_inverseMasses_, &dt, &d_lambdas_, @@ -345,7 +348,7 @@ void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsig reallocateDeviceBuffer( &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_); copyToDeviceBuffer( - &d_inverseMasses_, (float*)inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr); + &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr); // Temperature scale group map only used if there are more then one group if (numTempScaleValues_ > 1) diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h index d7c77ff756..738437aa0d 100644 --- a/src/gromacs/mdlib/leapfrog_gpu.h +++ b/src/gromacs/mdlib/leapfrog_gpu.h @@ -47,20 +47,16 @@ #include "config.h" #if GMX_GPU_CUDA -# include "gromacs/gpu_utils/devicebuffer.cuh" # include "gromacs/gpu_utils/gputraits.cuh" #endif #if GMX_GPU_SYCL -# include "gromacs/gpu_utils/devicebuffer_sycl.h" # include "gromacs/gpu_utils/gputraits_sycl.h" -using float3 = Float3; #endif #include +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/hostallocator.h" -#include "gromacs/pbcutil/pbc.h" -#include "gromacs/pbcutil/pbc_aiuc.h" #include "gromacs/utility/arrayref.h" class DeviceContext; @@ -125,10 +121,10 @@ public: * \param[in] dtPressureCouple Period between pressure coupling steps * \param[in] prVelocityScalingMatrix Parrinello-Rahman velocity scaling matrix */ - void integrate(const DeviceBuffer d_x, - DeviceBuffer d_xp, - DeviceBuffer d_v, - const DeviceBuffer d_f, + void integrate(DeviceBuffer d_x, + DeviceBuffer d_xp, + DeviceBuffer d_v, + const DeviceBuffer d_f, const real dt, const bool doTemperatureScaling, gmx::ArrayRef tcstat, @@ -192,7 +188,7 @@ private: int numTempScaleGroupsAlloc_ = -1; //! Vector with diagonal elements of the Parrinello-Rahman pressure coupling velocity rescale factors - float3 prVelocityScalingMatrixDiagonal_; + Float3 prVelocityScalingMatrixDiagonal_; }; } // namespace gmx diff --git a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp index 9afb0320fb..b0e2583a3f 100644 --- a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp +++ b/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp @@ -84,15 +84,15 @@ using cl::sycl::access::mode; template auto leapFrogKernel( cl::sycl::handler& cgh, - DeviceAccessor a_x, - DeviceAccessor a_xp, - DeviceAccessor a_v, - DeviceAccessor a_f, + DeviceAccessor a_x, + DeviceAccessor a_xp, + DeviceAccessor a_v, + DeviceAccessor a_f, DeviceAccessor a_inverseMasses, float dt, OptionalAccessor a_lambdas, OptionalAccessor a_tempScaleGroups, - float3 prVelocityScalingMatrixDiagonal) + Float3 prVelocityScalingMatrixDiagonal) { cgh.require(a_x); cgh.require(a_xp); @@ -109,9 +109,9 @@ auto leapFrogKernel( } return [=](cl::sycl::id<1> itemIdx) { - const float3 x = a_x[itemIdx]; - const float3 v = a_v[itemIdx]; - const float3 f = a_f[itemIdx]; + const Float3 x = a_x[itemIdx]; + const Float3 v = a_v[itemIdx]; + const Float3 f = a_f[itemIdx]; const float im = a_inverseMasses[itemIdx]; const float imdt = im * dt; @@ -137,20 +137,20 @@ auto leapFrogKernel( } }(); - const float3 prVelocityDelta = [=]() { + const Float3 prVelocityDelta = [=]() { if constexpr (velocityScaling == VelocityScalingType::Diagonal) { - return float3{ prVelocityScalingMatrixDiagonal[0] * v[0], + return Float3{ prVelocityScalingMatrixDiagonal[0] * v[0], prVelocityScalingMatrixDiagonal[1] * v[1], prVelocityScalingMatrixDiagonal[2] * v[2] }; } else if constexpr (velocityScaling == VelocityScalingType::None) { - return float3{ 0, 0, 0 }; + return Float3{ 0, 0, 0 }; } }(); - const float3 v_new = v * lambda - prVelocityDelta + f * imdt; + const Float3 v_new = v * lambda - prVelocityDelta + f * imdt; a_v[itemIdx] = v_new; a_x[itemIdx] = x + v_new * dt; }; @@ -216,10 +216,10 @@ static inline cl::sycl::event launchLeapFrogKernel(NumTempScaleValues tempScali prVelocityScalingType); } -void LeapFrogGpu::integrate(DeviceBuffer d_x, - DeviceBuffer d_xp, - DeviceBuffer d_v, - DeviceBuffer d_f, +void LeapFrogGpu::integrate(DeviceBuffer d_x, + DeviceBuffer d_xp, + DeviceBuffer d_v, + DeviceBuffer d_f, const real dt, const bool doTemperatureScaling, gmx::ArrayRef tcstat, @@ -253,7 +253,7 @@ void LeapFrogGpu::integrate(DeviceBuffer d_x, "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported " "in GPU version of Leap-Frog integrator."); prVelocityScalingMatrixDiagonal_ = dtPressureCouple - * float3{ prVelocityScalingMatrix[XX][XX], + * Float3{ prVelocityScalingMatrix[XX][XX], prVelocityScalingMatrix[YY][YY], prVelocityScalingMatrix[ZZ][ZZ] }; } diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cu index 0967c20781..466c250f4c 100644 --- a/src/gromacs/mdlib/lincs_gpu.cu +++ b/src/gromacs/mdlib/lincs_gpu.cu @@ -59,7 +59,8 @@ #include "gromacs/gpu_utils/cuda_arch_utils.cuh" #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/devicebuffer.cuh" -#include "gromacs/gpu_utils/gputraits.cuh" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/math/functions.h" #include "gromacs/math/vec.h" @@ -427,14 +428,14 @@ inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVir return kernelPtr; } -void LincsGpu::apply(const float3* d_x, - float3* d_xp, - const bool updateVelocities, - float3* d_v, - const real invdt, - const bool computeVirial, - tensor virialScaled, - const PbcAiuc pbcAiuc) +void LincsGpu::apply(const DeviceBuffer d_x, + DeviceBuffer d_xp, + const bool updateVelocities, + DeviceBuffer d_v, + const real invdt, + const bool computeVirial, + tensor virialScaled, + const PbcAiuc pbcAiuc) { ensureNoPendingDeviceError("In CUDA version of LINCS"); @@ -479,8 +480,13 @@ void LincsGpu::apply(const float3* d_x, kernelParams_.pbcAiuc = pbcAiuc; - const auto kernelArgs = - prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, + config, + &kernelParams_, + asFloat3Pointer(&d_x), + asFloat3Pointer(&d_xp), + asFloat3Pointer(&d_v), + &invdt); launchGpuKernel(kernelPtr, config, diff --git a/src/gromacs/mdlib/lincs_gpu.cuh b/src/gromacs/mdlib/lincs_gpu.cuh index 40433efcb7..6507892649 100644 --- a/src/gromacs/mdlib/lincs_gpu.cuh +++ b/src/gromacs/mdlib/lincs_gpu.cuh @@ -46,6 +46,7 @@ #include +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream.h" #include "gromacs/gpu_utils/gputraits.cuh" @@ -134,14 +135,14 @@ public: * \param[in,out] virialScaled Scaled virial tensor to be updated. * \param[in] pbcAiuc PBC data. */ - void apply(const float3* d_x, - float3* d_xp, - const bool updateVelocities, - float3* d_v, - const real invdt, - const bool computeVirial, - tensor virialScaled, - const PbcAiuc pbcAiuc); + void apply(const DeviceBuffer d_x, + DeviceBuffer d_xp, + const bool updateVelocities, + DeviceBuffer d_v, + const real invdt, + const bool computeVirial, + tensor virialScaled, + const PbcAiuc pbcAiuc); /*! \brief * Update data-structures (e.g. after NB search step). diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu.cu index eb4b495772..0cc25c50ce 100644 --- a/src/gromacs/mdlib/settle_gpu.cu +++ b/src/gromacs/mdlib/settle_gpu.cu @@ -59,7 +59,8 @@ #include "gromacs/gpu_utils/cuda_arch_utils.cuh" #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/devicebuffer.h" -#include "gromacs/gpu_utils/gputraits.cuh" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/gpu_utils/typecasts.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/math/functions.h" #include "gromacs/math/vec.h" @@ -396,14 +397,14 @@ inline auto getSettleKernelPtr(const bool updateVelocities, const bool computeVi return kernelPtr; } -void SettleGpu::apply(const float3* d_x, - float3* d_xp, - const bool updateVelocities, - float3* d_v, - const real invdt, - const bool computeVirial, - tensor virialScaled, - const PbcAiuc pbcAiuc) +void SettleGpu::apply(const DeviceBuffer d_x, + DeviceBuffer d_xp, + const bool updateVelocities, + DeviceBuffer d_v, + const real invdt, + const bool computeVirial, + tensor virialScaled, + const PbcAiuc pbcAiuc) { ensureNoPendingDeviceError("In CUDA version SETTLE"); @@ -440,8 +441,17 @@ void SettleGpu::apply(const float3* d_x, config.sharedMemorySize = 0; } - const auto kernelArgs = prepareGpuKernelArguments( - kernelPtr, config, &numSettles_, &d_atomIds_, &settleParameters_, &d_x, &d_xp, &invdt, &d_v, &d_virialScaled_, &pbcAiuc); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, + config, + &numSettles_, + &d_atomIds_, + &settleParameters_, + asFloat3Pointer(&d_x), + asFloat3Pointer(&d_xp), + &invdt, + asFloat3Pointer(&d_v), + &d_virialScaled_, + &pbcAiuc); launchGpuKernel(kernelPtr, config, diff --git a/src/gromacs/mdlib/settle_gpu.cuh b/src/gromacs/mdlib/settle_gpu.cuh index 3a96ec4d39..f09fbd344f 100644 --- a/src/gromacs/mdlib/settle_gpu.cuh +++ b/src/gromacs/mdlib/settle_gpu.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -45,9 +45,10 @@ #include "gmxpre.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream.h" -#include "gromacs/gpu_utils/gputraits.cuh" +#include "gromacs/gpu_utils/gputraits.h" #include "gromacs/math/functions.h" #include "gromacs/math/invertmatrix.h" #include "gromacs/math/vec.h" @@ -101,14 +102,14 @@ public: * \param[in,out] virialScaled Scaled virial tensor to be updated. * \param[in] pbcAiuc PBC data. */ - void apply(const float3* d_x, - float3* d_xp, - const bool updateVelocities, - float3* d_v, - const real invdt, - const bool computeVirial, - tensor virialScaled, - const PbcAiuc pbcAiuc); + void apply(const DeviceBuffer d_x, + DeviceBuffer d_xp, + const bool updateVelocities, + DeviceBuffer d_v, + const real invdt, + const bool computeVirial, + tensor virialScaled, + const PbcAiuc pbcAiuc); /*! \brief * Update data-structures (e.g. after NB search step). diff --git a/src/gromacs/mdlib/tests/constrtestrunners.cu b/src/gromacs/mdlib/tests/constrtestrunners.cu index 94be8b35f9..c22b91f351 100644 --- a/src/gromacs/mdlib/tests/constrtestrunners.cu +++ b/src/gromacs/mdlib/tests/constrtestrunners.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -52,6 +52,7 @@ #include #include "gromacs/gpu_utils/devicebuffer.cuh" +#include "gromacs/gpu_utils/gputraits.h" #include "gromacs/hardware/device_information.h" #include "gromacs/mdlib/lincs_gpu.cuh" #include "gromacs/pbcutil/pbc.h" @@ -71,9 +72,14 @@ void LincsDeviceConstraintsRunner::applyConstraints(ConstraintsTestData* testDat auto lincsGpu = std::make_unique( testData->ir_.nLincsIter, testData->ir_.nProjOrder, deviceContext, deviceStream); - bool updateVelocities = true; - int numAtoms = testData->numAtoms_; - float3 *d_x, *d_xp, *d_v; + bool updateVelocities = true; + int numAtoms = testData->numAtoms_; + + Float3* h_x = gmx::asGenericFloat3Pointer(testData->x_); + Float3* h_xp = gmx::asGenericFloat3Pointer(testData->xPrime_); + Float3* h_v = gmx::asGenericFloat3Pointer(testData->v_); + + DeviceBuffer d_x, d_xp, d_v; lincsGpu->set(*testData->idef_, testData->numAtoms_, testData->invmass_.data()); PbcAiuc pbcAiuc; @@ -83,24 +89,19 @@ void LincsDeviceConstraintsRunner::applyConstraints(ConstraintsTestData* testDat allocateDeviceBuffer(&d_xp, numAtoms, deviceContext); allocateDeviceBuffer(&d_v, numAtoms, deviceContext); - copyToDeviceBuffer( - &d_x, (float3*)(testData->x_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer( - &d_xp, (float3*)(testData->xPrime_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyToDeviceBuffer( - &d_v, (float3*)(testData->v_.data()), 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } lincsGpu->apply( d_x, d_xp, updateVelocities, d_v, testData->invdt_, testData->computeVirial_, testData->virialScaled_, pbcAiuc); - copyFromDeviceBuffer( - (float3*)(testData->xPrime_.data()), &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer(h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyFromDeviceBuffer( - (float3*)(testData->v_.data()), &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } freeDeviceBuffer(&d_x); diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp b/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp index b85ed94b52..97c9c29481 100644 --- a/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp +++ b/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp @@ -59,6 +59,7 @@ # include "gromacs/mdlib/leapfrog_gpu.h" #endif +#include "gromacs/gpu_utils/gputraits.h" #include "gromacs/hardware/device_information.h" #include "gromacs/mdlib/stat.h" @@ -76,14 +77,12 @@ void LeapFrogDeviceTestRunner::integrate(LeapFrogTestData* testData, int numStep int numAtoms = testData->numAtoms_; - static_assert(sizeof(float3) == sizeof(*testData->x_.data()), "Incompatible types"); + Float3* h_x = gmx::asGenericFloat3Pointer(testData->x_); + Float3* h_xp = gmx::asGenericFloat3Pointer(testData->xPrime_); + Float3* h_v = gmx::asGenericFloat3Pointer(testData->v_); + Float3* h_f = gmx::asGenericFloat3Pointer(testData->f_); - float3* h_x = reinterpret_cast(testData->x_.data()); - float3* h_xp = reinterpret_cast(testData->xPrime_.data()); - float3* h_v = reinterpret_cast(testData->v_.data()); - float3* h_f = reinterpret_cast(testData->f_.data()); - - DeviceBuffer d_x, d_xp, d_v, d_f; + DeviceBuffer d_x, d_xp, d_v, d_f; allocateDeviceBuffer(&d_x, numAtoms, deviceContext); allocateDeviceBuffer(&d_xp, numAtoms, deviceContext); diff --git a/src/gromacs/mdlib/tests/settletestrunners.cu b/src/gromacs/mdlib/tests/settletestrunners.cu index 0d2a3e116b..7aab750547 100644 --- a/src/gromacs/mdlib/tests/settletestrunners.cu +++ b/src/gromacs/mdlib/tests/settletestrunners.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -52,6 +52,7 @@ #include #include "gromacs/gpu_utils/devicebuffer.cuh" +#include "gromacs/gpu_utils/gputraits.h" #include "gromacs/hardware/device_information.h" #include "gromacs/mdlib/settle_gpu.cuh" #include "gromacs/utility/unique_cptr.h" @@ -85,30 +86,29 @@ void SettleDeviceTestRunner::applySettle(SettleTestData* testData, int numAtoms = testData->numAtoms_; - float3 *d_x, *d_xp, *d_v; + DeviceBuffer d_x, d_xp, d_v; - float3* h_x = (float3*)(as_rvec_array(testData->x_.data())); - float3* h_xp = (float3*)(as_rvec_array(testData->xPrime_.data())); - float3* h_v = (float3*)(as_rvec_array(testData->v_.data())); + Float3* h_x = gmx::asGenericFloat3Pointer(testData->x_); + Float3* h_xp = gmx::asGenericFloat3Pointer(testData->xPrime_); + Float3* h_v = gmx::asGenericFloat3Pointer(testData->v_); allocateDeviceBuffer(&d_x, numAtoms, deviceContext); allocateDeviceBuffer(&d_xp, numAtoms, deviceContext); allocateDeviceBuffer(&d_v, numAtoms, deviceContext); - copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); - copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyToDeviceBuffer(&d_v, (float3*)h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } settleGpu->apply( d_x, d_xp, updateVelocities, d_v, testData->reciprocalTimeStep_, calcVirial, testData->virial_, pbcAiuc); - copyFromDeviceBuffer((float3*)h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer(h_xp, &d_xp, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); if (updateVelocities) { - copyFromDeviceBuffer( - (float3*)h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); + copyFromDeviceBuffer(h_v, &d_v, 0, numAtoms, deviceStream, GpuApiCallBehavior::Sync, nullptr); } freeDeviceBuffer(&d_x); diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index f03ab778c3..b561856511 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu @@ -240,9 +240,9 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir, UpdateConstrainGpu::Impl::~Impl() {} -void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, - DeviceBuffer d_v, - const DeviceBuffer d_f, +void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, const InteractionDefinitions& idef, const t_mdatoms& md) { @@ -254,9 +254,9 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null."); GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null."); - d_x_ = reinterpret_cast(d_x); - d_v_ = reinterpret_cast(d_v); - d_f_ = reinterpret_cast(d_f); + d_x_ = d_x; + d_v_ = d_v; + d_f_ = d_f; numAtoms_ = md.nr; @@ -334,9 +334,9 @@ void UpdateConstrainGpu::scaleVelocities(const matrix scalingMatrix) impl_->scaleVelocities(scalingMatrix); } -void UpdateConstrainGpu::set(DeviceBuffer d_x, - DeviceBuffer d_v, - const DeviceBuffer d_f, +void UpdateConstrainGpu::set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, const InteractionDefinitions& idef, const t_mdatoms& md) { diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index 8e101b8cd0..76e41398e7 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h @@ -150,9 +150,9 @@ public: * \param[in] idef System topology * \param[in] md Atoms data. */ - void set(DeviceBuffer d_x, - DeviceBuffer d_v, - const DeviceBuffer d_f, + void set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, const InteractionDefinitions& idef, const t_mdatoms& md); @@ -193,14 +193,14 @@ private: int numAtoms_; //! Local copy of the pointer to the device positions buffer - float3* d_x_; + DeviceBuffer d_x_; //! Local copy of the pointer to the device velocities buffer - float3* d_v_; + DeviceBuffer d_v_; //! Local copy of the pointer to the device forces buffer - float3* d_f_; + DeviceBuffer d_f_; //! Device buffer for intermediate positions (maintained internally) - float3* d_xp_; + DeviceBuffer d_xp_; //! Number of elements in shifted coordinates buffer int numXp_ = -1; //! Allocation size for the shifted coordinates buffer @@ -208,7 +208,7 @@ private: //! 1/mass for all atoms (GPU) - real* d_inverseMasses_; + DeviceBuffer d_inverseMasses_; //! Number of elements in reciprocal masses buffer int numInverseMasses_ = -1; //! Allocation size for the reciprocal masses buffer diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index bd5fa8de5d..76341b7a97 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -778,7 +778,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); } -void* getGpuForces(NbnxmGpu* nb) +DeviceBuffer getGpuForces(NbnxmGpu* nb) { return nb->atdat->f; } diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index a8e919ea01..196befc92d 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -213,7 +213,7 @@ int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const return numAtoms; } -void* nonbonded_verlet_t::getGpuForces() const +DeviceBuffer nonbonded_verlet_t::getGpuForces() const { return Nbnxm::getGpuForces(gpu_nbv); } diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 30380ebeb2..a36efad023 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -395,7 +395,7 @@ public: * * \returns A pointer to the force buffer in GPU memory */ - void* getGpuForces() const; + DeviceBuffer getGpuForces() const; //! Return the kernel setup const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; } diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index fec64bbe68..ebdba45b9d 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -313,7 +313,8 @@ void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM; * \returns A pointer to the force buffer in GPU memory */ CUDA_FUNC_QUALIFIER -void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); +DeviceBuffer getGpuForces(NbnxmGpu gmx_unused* nb) + CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer{}); } // namespace Nbnxm #endif -- 2.22.0