From c5c220a03663d975e31e8573c1849247ce3f8ad0 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Mon, 20 Jan 2020 18:32:46 +0100 Subject: [PATCH] Use RVec instead of float for x, v and f device buffers Using RVec instead of float for coordinates data-types allows to remove multiplications by DIM when the adresses, offsets and sizes are computed. Since the native device types are not used in CPU part of the code, the type casting remains. Refs #3312 and #2936 Change-Id: Iaea914a474195f214ca860f7345f6878b9a04813 --- src/gromacs/domdec/gpuhaloexchange.h | 4 +- src/gromacs/domdec/gpuhaloexchange_impl.cpp | 6 +- src/gromacs/domdec/gpuhaloexchange_impl.cu | 8 +- src/gromacs/ewald/pme.h | 4 +- src/gromacs/ewald/pme_calculate_splines.cuh | 28 +++++- .../ewald/pme_coordinate_receiver_gpu.h | 3 +- .../pme_coordinate_receiver_gpu_impl.cpp | 2 +- .../ewald/pme_coordinate_receiver_gpu_impl.cu | 6 +- .../ewald/pme_coordinate_receiver_gpu_impl.h | 2 +- src/gromacs/ewald/pme_gather.cu | 20 ++--- src/gromacs/ewald/pme_gpu.cpp | 2 +- src/gromacs/ewald/pme_gpu_internal.cpp | 2 +- src/gromacs/ewald/pme_gpu_internal.h | 2 +- src/gromacs/ewald/pme_gpu_types.h | 2 +- src/gromacs/ewald/pme_spread.cu | 17 ++-- src/gromacs/gpu_utils/cudautils.cuh | 13 +++ src/gromacs/gpu_utils/devicebuffer_ocl.h | 2 +- src/gromacs/gpu_utils/tests/CMakeLists.txt | 3 +- src/gromacs/gpu_utils/tests/gpu_utils.cpp | 89 +++++++++++++++++++ src/gromacs/mdlib/update_constrain_gpu.h | 12 +-- .../mdlib/update_constrain_gpu_impl.cpp | 6 +- .../mdlib/update_constrain_gpu_impl.cu | 24 ++--- src/gromacs/mdlib/update_constrain_gpu_impl.h | 12 +-- .../mdtypes/state_propagator_data_gpu.h | 6 +- .../state_propagator_data_gpu_impl.cpp | 12 +-- .../mdtypes/state_propagator_data_gpu_impl.h | 16 ++-- .../state_propagator_data_gpu_impl_gpu.cpp | 42 ++++----- src/gromacs/nbnxm/atomdata.cpp | 4 +- src/gromacs/nbnxm/atomdata.h | 14 +-- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 15 ++-- src/gromacs/nbnxm/nbnxm.cpp | 4 +- src/gromacs/nbnxm/nbnxm.h | 10 +-- src/gromacs/nbnxm/nbnxm_gpu.h | 6 +- 33 files changed, 257 insertions(+), 141 deletions(-) create mode 100644 src/gromacs/gpu_utils/tests/gpu_utils.cpp diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h index b7e6ff54fa..d32b1800c4 100644 --- a/src/gromacs/domdec/gpuhaloexchange.h +++ b/src/gromacs/domdec/gpuhaloexchange.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -92,7 +92,7 @@ public: * \param [in] d_coordinateBuffer pointer to coordinates buffer in GPU memory * \param [in] d_forcesBuffer pointer to coordinates buffer in GPU memory */ - void reinitHalo(DeviceBuffer d_coordinateBuffer, DeviceBuffer d_forcesBuffer); + void reinitHalo(DeviceBuffer d_coordinateBuffer, DeviceBuffer d_forcesBuffer); /*! \brief GPU halo exchange of coordinates buffer. diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp index 2511673218..a17c550c6c 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -73,8 +73,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */, GpuHaloExchange::~GpuHaloExchange() = default; /*!\brief init halo exhange stub. */ -void GpuHaloExchange::reinitHalo(DeviceBuffer /* d_coordinatesBuffer */, - DeviceBuffer /* d_forcesBuffer */) +void GpuHaloExchange::reinitHalo(DeviceBuffer /* d_coordinatesBuffer */, + DeviceBuffer /* d_forcesBuffer */) { GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation."); diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index 660566a9dd..4313ffacb0 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -57,6 +57,7 @@ #include "gromacs/gpu_utils/devicebuffer.h" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" +#include "gromacs/math/vectypes.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/utility/gmxmpi.h" @@ -435,10 +436,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void GpuHaloExchange::~GpuHaloExchange() = default; -void GpuHaloExchange::reinitHalo(DeviceBuffer d_coordinatesBuffer, DeviceBuffer d_forcesBuffer) +void GpuHaloExchange::reinitHalo(DeviceBuffer d_coordinatesBuffer, DeviceBuffer d_forcesBuffer) { - impl_->reinitHalo(reinterpret_cast(d_coordinatesBuffer), - reinterpret_cast(d_forcesBuffer)); + impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer)); } void GpuHaloExchange::communicateHaloCoordinates(const matrix box, diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index edbe283523..f1f829e445 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -451,8 +451,8 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t* GPU_FUNC_ARG * \param[in] pme The PME data structure. * \param[in] d_x The pointer to the positions buffer to be set */ -GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme), - DeviceBuffer GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM; +GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme), + DeviceBuffer GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM; /*! \brief Get pointer to device copy of force data. * \param[in] pme The PME data structure. diff --git a/src/gromacs/ewald/pme_calculate_splines.cuh b/src/gromacs/ewald/pme_calculate_splines.cuh index f52c81c599..4e5dc6e888 100644 --- a/src/gromacs/ewald/pme_calculate_splines.cuh +++ b/src/gromacs/ewald/pme_calculate_splines.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -52,6 +52,30 @@ //! Controls if the atom and charge data is prefeched into shared memory or loaded per thread from global static const bool c_useAtomDataPrefetch = true; +/*! \brief Asserts if the argument is finite. + * + * The function works for any data type, that can be casted to float. Note that there is also + * a specialized implementation for float3 data type. + * + * \param[in] arg Argument to check. + */ +template +__device__ inline void assertIsFinite(T arg); + +template<> +__device__ inline void assertIsFinite(float3 arg) +{ + assert(isfinite(float(arg.x))); + assert(isfinite(float(arg.y))); + assert(isfinite(float(arg.z))); +} + +template +__device__ inline void assertIsFinite(T arg) +{ + assert(isfinite(float(arg))); +} + /*! \brief * General purpose function for loading atom-related data from global to shared memory. * @@ -79,7 +103,7 @@ __device__ __forceinline__ void pme_gpu_stage_atom_data(const PmeGpuCudaKernelPa pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom); if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck) { - assert(isfinite(float(gm_source[globalIndex]))); + assertIsFinite(gm_source[globalIndex]); sm_destination[localIndex] = gm_source[globalIndex]; } } diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h index 5e4c4967ba..b5d02a719e 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h @@ -43,6 +43,7 @@ #define GMX_PMECOORDINATERECEIVERGPU_H #include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/math/vectypes.h" #include "gromacs/utility/classhelpers.h" #include "gromacs/utility/gmxmpi.h" @@ -70,7 +71,7 @@ public: * send coordinates buffer address to PP rank * \param[in] d_x coordinates buffer in GPU memory */ - void sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x); + void sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x); /*! \brief diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp index c026d41b31..0cb848e6c0 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp @@ -75,7 +75,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* /* pmeStream */, PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default; /*!\brief init PME-PP GPU communication stub */ -void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer /* d_x */) +void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer /* d_x */) { GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication initialization was called instead of the " diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu index 1584a9e844..b2e7fa009d 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu @@ -69,7 +69,7 @@ PmeCoordinateReceiverGpu::Impl::Impl(const void* pmeStream, MPI_Comm comm, gmx:: PmeCoordinateReceiverGpu::Impl::~Impl() = default; -void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x) +void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x) { int ind_start = 0; @@ -80,7 +80,7 @@ void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(Device ind_end = ind_start + receiver.numAtoms; // Data will be transferred directly from GPU. - void* sendBuf = reinterpret_cast(&d_x[ind_start * DIM]); + void* sendBuf = reinterpret_cast(&d_x[ind_start]); #if GMX_MPI MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_); @@ -131,7 +131,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const void* pmeStr PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default; -void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x) +void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x) { impl_->sendCoordinateBufferAddressToPpRanks(d_x); } diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h index 874fb2747e..4f3bbe2e4e 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h @@ -69,7 +69,7 @@ public: * send coordinates buffer address to PP rank * \param[in] d_x coordinates buffer in GPU memory */ - void sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x); + void sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x); /*! \brief * launch receive of coordinate data from PP rank diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu index 8b2ff5f80e..616516df23 100644 --- a/src/gromacs/ewald/pme_gather.cu +++ b/src/gromacs/ewald/pme_gather.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -44,6 +44,7 @@ #include #include "gromacs/gpu_utils/cuda_kernel_utils.cuh" +#include "gromacs/gpu_utils/cudautils.cuh" #include "pme.cuh" #include "pme_calculate_splines.cuh" @@ -321,32 +322,27 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ } else { + const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates); /* Recaclulate Splines */ if (c_useAtomDataPrefetch) { // charges __shared__ float sm_coefficients[atomsPerBlock]; // Coordinates - __shared__ float sm_coordinates[DIM * atomsPerBlock]; + __shared__ float3 sm_coordinates[atomsPerBlock]; /* Staging coefficients/charges */ - pme_gpu_stage_atom_data(kernelParams, sm_coefficients, - kernelParams.atoms.d_coefficients); + pme_gpu_stage_atom_data(kernelParams, sm_coefficients, gm_coefficients); /* Staging coordinates */ - pme_gpu_stage_atom_data(kernelParams, sm_coordinates, - kernelParams.atoms.d_coordinates); + pme_gpu_stage_atom_data(kernelParams, sm_coordinates, gm_coordinates); __syncthreads(); - atomX.x = sm_coordinates[atomIndexLocal * DIM + XX]; - atomX.y = sm_coordinates[atomIndexLocal * DIM + YY]; - atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ]; + atomX = sm_coordinates[atomIndexLocal]; atomCharge = sm_coefficients[atomIndexLocal]; } else { + atomX = gm_coordinates[atomIndexGlobal]; atomCharge = gm_coefficients[atomIndexGlobal]; - atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX]; - atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY]; - atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ]; } calculate_splines( kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices); diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index aafd5c1a4d..44fafc205c 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -440,7 +440,7 @@ void* pme_gpu_get_device_f(const gmx_pme_t* pme) return pme_gpu_get_kernelparam_forces(pme->gpu); } -void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer d_x) +void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer d_x) { GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function."); GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 9c98402525..1aa2052907 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -1523,7 +1523,7 @@ void* pme_gpu_get_kernelparam_forces(const PmeGpu* pmeGpu) } } -void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer d_x) +void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer d_x) { GMX_ASSERT(pmeGpu && pmeGpu->kernelParams, "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was " diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index cc7e9d1f34..2816a22add 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -395,7 +395,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT( * \param[in] d_x Pointer to coordinate data */ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), - DeviceBuffer GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM; + DeviceBuffer GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM; /*! \brief Return pointer to device copy of force data. * \param[in] pmeGpu The PME GPU structure. diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h index be501f2cfa..5ddd79fb5a 100644 --- a/src/gromacs/ewald/pme_gpu_types.h +++ b/src/gromacs/ewald/pme_gpu_types.h @@ -157,7 +157,7 @@ struct PmeGpuAtomParams * The coordinates themselves change and need to be copied to the GPU for every PME computation, * but reallocation happens only at DD. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_coordinates; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_coordinates; /*! \brief Global GPU memory array handle with input atom charges. * The charges only need to be reallocated and copied to the GPU at DD step. */ diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu index 3d02e43d0d..99f7828c86 100644 --- a/src/gromacs/ewald/pme_spread.cu +++ b/src/gromacs/ewald/pme_spread.cu @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013-2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2013-2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -47,6 +47,7 @@ #include #include "gromacs/gpu_utils/cuda_kernel_utils.cuh" +#include "gromacs/gpu_utils/cudautils.cuh" #include "pme.cuh" #include "pme_calculate_splines.cuh" @@ -228,24 +229,20 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU if (computeSplines) { + const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates); if (c_useAtomDataPrefetch) { // Coordinates - __shared__ float sm_coordinates[DIM * atomsPerBlock]; + __shared__ float3 sm_coordinates[atomsPerBlock]; /* Staging coordinates */ - pme_gpu_stage_atom_data(kernelParams, sm_coordinates, - kernelParams.atoms.d_coordinates); + pme_gpu_stage_atom_data(kernelParams, sm_coordinates, gm_coordinates); __syncthreads(); - atomX.x = sm_coordinates[atomIndexLocal * DIM + XX]; - atomX.y = sm_coordinates[atomIndexLocal * DIM + YY]; - atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ]; + atomX = sm_coordinates[atomIndexLocal]; } else { - atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX]; - atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY]; - atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ]; + atomX = gm_coordinates[atomIndexGlobal]; } calculate_splines( kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices); diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh index 71d9b7dac4..6f41e8589c 100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@ -215,6 +215,19 @@ static inline void rvec_inc(rvec a, const float3 b) rvec tmp = { b.x, b.y, b.z }; rvec_inc(a, tmp); } +/*! \brief Cast RVec buffer to float3 buffer. + * + * \param[in] in The RVec buffer to cast. + * + * \returns Buffer, casted to float3*. + */ +static inline __host__ __device__ float3* asFloat3(gmx::RVec* in) +{ + static_assert(sizeof(in[0]) == sizeof(float3), + "Size of the host-side data-type is different from the size of the device-side " + "counterpart."); + return reinterpret_cast(in); +} /*! \brief Wait for all taks in stream \p s to complete. * diff --git a/src/gromacs/gpu_utils/devicebuffer_ocl.h b/src/gromacs/gpu_utils/devicebuffer_ocl.h index 2c92c81c1b..40f1e12941 100644 --- a/src/gromacs/gpu_utils/devicebuffer_ocl.h +++ b/src/gromacs/gpu_utils/devicebuffer_ocl.h @@ -228,7 +228,7 @@ void clearDeviceBufferAsync(DeviceBuffer* buffer, size_t startingOffs GMX_ASSERT(buffer, "needs a buffer pointer"); const size_t offset = startingOffset * sizeof(ValueType); const size_t bytes = numValues * sizeof(ValueType); - const ValueType pattern = 0; + const int pattern = 0; const cl_uint numWaitEvents = 0; const cl_event* waitEvents = nullptr; cl_event commandEvent; diff --git a/src/gromacs/gpu_utils/tests/CMakeLists.txt b/src/gromacs/gpu_utils/tests/CMakeLists.txt index c2fc3426bb..e9a62b792b 100644 --- a/src/gromacs/gpu_utils/tests/CMakeLists.txt +++ b/src/gromacs/gpu_utils/tests/CMakeLists.txt @@ -1,7 +1,7 @@ # # This file is part of the GROMACS molecular simulation package. # -# Copyright (c) 2017,2018,2019, by the GROMACS development team, led by +# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -48,6 +48,7 @@ if(GMX_USE_CUDA) # CUDA-only test list(APPEND SOURCES_FROM_CXX pinnedmemorychecker.cpp + gpu_utils.cpp ) # TODO Making a separate library is heavy handed, but nothing else # seems to work. Also don't use a hyphen in its name, because nvcc diff --git a/src/gromacs/gpu_utils/tests/gpu_utils.cpp b/src/gromacs/gpu_utils/tests/gpu_utils.cpp new file mode 100644 index 0000000000..5adf0a72a7 --- /dev/null +++ b/src/gromacs/gpu_utils/tests/gpu_utils.cpp @@ -0,0 +1,89 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * \brief + * Tests for CUDA float3 type layout. + * + * \author Artem Zhmurov + */ +#include "gmxpre.h" + +#include "config.h" + +#include + +#ifndef __CUDA_ARCH__ +/*! \brief Dummy definition to avoid compiler error + * + * \todo Find a better solution. Probably, move asFloat3(...) function to different header. + */ +# define __CUDA_ARCH__ -1 +# include +# undef __CUDA_ARCH__ +#else +# include +#endif +#include + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/math/vectypes.h" +#include "gromacs/utility/real.h" + +#if GMX_GPU == GMX_GPU_CUDA + +namespace gmx +{ + +namespace test +{ + +TEST(GpuDataTypesCompatibilityTest, RVecAndFloat3) +{ + std::vector dataRVec; + dataRVec.emplace_back(1.0, 2.0, 3.0); + dataRVec.emplace_back(4.0, 5.0, 6.0); + float3* dataFloat3 = asFloat3(dataRVec.data()); + EXPECT_EQ(dataFloat3[0].x, dataRVec[0][XX]); + EXPECT_EQ(dataFloat3[0].y, dataRVec[0][YY]); + EXPECT_EQ(dataFloat3[0].z, dataRVec[0][ZZ]); + EXPECT_EQ(dataFloat3[1].x, dataRVec[1][XX]); + EXPECT_EQ(dataFloat3[1].y, dataRVec[1][YY]); + EXPECT_EQ(dataFloat3[1].z, dataRVec[1][ZZ]); +} + +} // namespace test +} // namespace gmx + +#endif // GMX_GPU == GMX_GPU_CUDA \ No newline at end of file diff --git a/src/gromacs/mdlib/update_constrain_gpu.h b/src/gromacs/mdlib/update_constrain_gpu.h index e0190c367f..359ecea5c1 100644 --- a/src/gromacs/mdlib/update_constrain_gpu.h +++ b/src/gromacs/mdlib/update_constrain_gpu.h @@ -133,12 +133,12 @@ public: * \param[in] md Atoms data. * \param[in] numTempScaleValues Number of temperature scaling groups. Zero for no temperature scaling. */ - void set(DeviceBuffer d_x, - DeviceBuffer d_v, - DeviceBuffer d_f, - const t_idef& idef, - const t_mdatoms& md, - int numTempScaleValues); + void set(DeviceBuffer d_x, + DeviceBuffer d_v, + DeviceBuffer d_f, + const t_idef& idef, + const t_mdatoms& md, + int numTempScaleValues); /*! \brief * Update PBC data. diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp index 919ba9b36b..47671ef7de 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp @@ -88,9 +88,9 @@ void UpdateConstrainGpu::scaleCoordinates(const matrix /* scalingMatrix */) "A CPU stub for UpdateConstrain was called instead of the correct implementation."); } -void UpdateConstrainGpu::set(DeviceBuffer /* d_x */, - DeviceBuffer /* d_v */, - const DeviceBuffer /* d_f */, +void UpdateConstrainGpu::set(DeviceBuffer /* d_x */, + DeviceBuffer /* d_v */, + const DeviceBuffer /* d_f */, const t_idef& /* idef */, const t_mdatoms& /* md */, const int /* numTempScaleValues */) diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index ebbe15a02f..6991ef0dc3 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu @@ -188,12 +188,12 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir, UpdateConstrainGpu::Impl::~Impl() {} -void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, - DeviceBuffer d_v, - const DeviceBuffer d_f, - const t_idef& idef, - const t_mdatoms& md, - const int numTempScaleValues) +void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, + const t_idef& idef, + const t_mdatoms& md, + const int numTempScaleValues) { GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null."); GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null."); @@ -259,12 +259,12 @@ void UpdateConstrainGpu::scaleCoordinates(const matrix scalingMatrix) impl_->scaleCoordinates(scalingMatrix); } -void UpdateConstrainGpu::set(DeviceBuffer d_x, - DeviceBuffer d_v, - const DeviceBuffer d_f, - const t_idef& idef, - const t_mdatoms& md, - const int numTempScaleValues) +void UpdateConstrainGpu::set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, + const t_idef& idef, + const t_mdatoms& md, + const int numTempScaleValues) { impl_->set(d_x, d_v, d_f, idef, md, numTempScaleValues); } diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index 5a28045afd..0009112dc6 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h @@ -133,12 +133,12 @@ public: * \param[in] md Atoms data. * \param[in] numTempScaleValues Number of temperature scaling groups. Set zero for no temperature coupling. */ - void set(DeviceBuffer d_x, - DeviceBuffer d_v, - const DeviceBuffer d_f, - const t_idef& idef, - const t_mdatoms& md, - const int numTempScaleValues); + void set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, + const t_idef& idef, + const t_mdatoms& md, + const int numTempScaleValues); /*! \brief * Update PBC data. diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 44cbc8d7e5..d75cd78ea1 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -184,7 +184,7 @@ public: * * \returns GPU positions buffer. */ - DeviceBuffer getCoordinates(); + DeviceBuffer getCoordinates(); /*! \brief Copy positions to the GPU memory. * @@ -245,7 +245,7 @@ public: * * \returns GPU velocities buffer. */ - DeviceBuffer getVelocities(); + DeviceBuffer getVelocities(); /*! \brief Copy velocities to the GPU memory. * @@ -280,7 +280,7 @@ public: * * \returns GPU force buffer. */ - DeviceBuffer getForces(); + DeviceBuffer getForces(); /*! \brief Copy forces to the GPU memory. * diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index ae6bcd6c71..15f054eafa 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -95,12 +95,12 @@ std::tuple StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomL return std::make_tuple(0, 0); } -DeviceBuffer StatePropagatorDataGpu::getCoordinates() +DeviceBuffer StatePropagatorDataGpu::getCoordinates() { GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from " "GPU implementation."); - return DeviceBuffer{}; + return {}; } GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent( @@ -153,12 +153,12 @@ void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef /* } -DeviceBuffer StatePropagatorDataGpu::getVelocities() +DeviceBuffer StatePropagatorDataGpu::getVelocities() { GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from " "GPU implementation."); - return DeviceBuffer{}; + return {}; } void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef /* h_v */, @@ -193,12 +193,12 @@ void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality /* atomLocal } -DeviceBuffer StatePropagatorDataGpu::getForces() +DeviceBuffer StatePropagatorDataGpu::getForces() { GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called instead of one from " "GPU implementation."); - return DeviceBuffer{}; + return {}; } void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef /* h_f */, diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h index 1423277722..a138bcbb62 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h @@ -181,7 +181,7 @@ public: * * \returns GPU positions buffer. */ - DeviceBuffer getCoordinates(); + DeviceBuffer getCoordinates(); /*! \brief Copy positions to the GPU memory. * @@ -242,7 +242,7 @@ public: * * \returns GPU velocities buffer. */ - DeviceBuffer getVelocities(); + DeviceBuffer getVelocities(); /*! \brief Copy velocities to the GPU memory. * @@ -277,7 +277,7 @@ public: * * \returns GPU force buffer. */ - DeviceBuffer getForces(); + DeviceBuffer getForces(); /*! \brief Copy forces to the GPU memory. * @@ -395,21 +395,21 @@ private: int numAtomsAll_ = -1; //! Device positions buffer - DeviceBuffer d_x_; + DeviceBuffer d_x_; //! Number of particles saved in the positions buffer int d_xSize_ = -1; //! Allocation size for the positions buffer int d_xCapacity_ = -1; //! Device velocities buffer - DeviceBuffer d_v_; + DeviceBuffer d_v_; //! Number of particles saved in the velocities buffer int d_vSize_ = -1; //! Allocation size for the velocities buffer int d_vCapacity_ = -1; //! Device force buffer - DeviceBuffer d_f_; + DeviceBuffer d_f_; //! Number of particles saved in the force buffer int d_fSize_ = -1; //! Allocation size for the force buffer @@ -428,7 +428,7 @@ private: * \param[in] atomLocality If all, local or non-local ranges should be copied. * \param[in] commandStream GPU stream to execute copy in. */ - void copyToDevice(DeviceBuffer d_data, + void copyToDevice(DeviceBuffer d_data, gmx::ArrayRef h_data, int dataSize, AtomLocality atomLocality, @@ -443,7 +443,7 @@ private: * \param[in] commandStream GPU stream to execute copy in. */ void copyFromDevice(gmx::ArrayRef h_data, - DeviceBuffer d_data, + DeviceBuffer d_data, int dataSize, AtomLocality atomLocality, CommandStream commandStream); diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index 031327e80d..4b385a5a7b 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -193,18 +193,18 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll) numAtomsPadded = numAtomsAll_; } - reallocateDeviceBuffer(&d_x_, DIM * numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_); + reallocateDeviceBuffer(&d_x_, numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_); const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_; if (paddingAllocationSize > 0) { // The PME stream is used here because the padding region of d_x_ is only in the PME task. - clearDeviceBufferAsync(&d_x_, DIM * numAtomsAll_, DIM * paddingAllocationSize, pmeStream_); + clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_); } - reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_); + reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_); const int d_fOldCapacity = d_fCapacity_; - reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_); + reallocateDeviceBuffer(&d_f_, numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_); // Clearing of the forces can be done in local stream since the nonlocal stream cannot reach // the force accumulation stage before syncing with the local stream. Only done in CUDA, // since the force buffer ops are not implemented in OpenCL. @@ -249,7 +249,7 @@ std::tuple StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality return std::make_tuple(atomsStartAt, numAtomsToCopy); } -void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer d_data, +void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer d_data, const gmx::ArrayRef h_data, int dataSize, AtomLocality atomLocality, @@ -269,18 +269,15 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer int atomsStartAt, numAtomsToCopy; std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality); - int elementsStartAt = atomsStartAt * DIM; - int numElementsToCopy = numAtomsToCopy * DIM; - if (numAtomsToCopy != 0) { - GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, + GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize, "The device allocation is smaller than requested copy range."); GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range."); - copyToDeviceBuffer(&d_data, reinterpret_cast(&h_data.data()[atomsStartAt]), - elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr); + copyToDeviceBuffer(&d_data, reinterpret_cast(&h_data.data()[atomsStartAt]), + atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr); } wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); @@ -288,7 +285,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer } void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_data, - DeviceBuffer d_data, + DeviceBuffer d_data, int dataSize, AtomLocality atomLocality, CommandStream commandStream) @@ -307,25 +304,22 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef h_dat int atomsStartAt, numAtomsToCopy; std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality); - int elementsStartAt = atomsStartAt * DIM; - int numElementsToCopy = numAtomsToCopy * DIM; - if (numAtomsToCopy != 0) { - GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, + GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize, "The device allocation is smaller than requested copy range."); GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range."); - copyFromDeviceBuffer(reinterpret_cast(&h_data.data()[atomsStartAt]), &d_data, - elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr); + copyFromDeviceBuffer(reinterpret_cast(&h_data.data()[atomsStartAt]), &d_data, + atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr); } wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA); wallcycle_stop(wcycle_, ewcLAUNCH_GPU); } -DeviceBuffer StatePropagatorDataGpu::Impl::getCoordinates() +DeviceBuffer StatePropagatorDataGpu::Impl::getCoordinates() { return d_x_; } @@ -422,7 +416,7 @@ void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomL } -DeviceBuffer StatePropagatorDataGpu::Impl::getVelocities() +DeviceBuffer StatePropagatorDataGpu::Impl::getVelocities() { return d_v_; } @@ -476,7 +470,7 @@ void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLo } -DeviceBuffer StatePropagatorDataGpu::Impl::getForces() +DeviceBuffer StatePropagatorDataGpu::Impl::getForces() { return d_f_; } @@ -595,7 +589,7 @@ std::tuple StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomL } -DeviceBuffer StatePropagatorDataGpu::getCoordinates() +DeviceBuffer StatePropagatorDataGpu::getCoordinates() { return impl_->getCoordinates(); } @@ -635,7 +629,7 @@ void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocalit } -DeviceBuffer StatePropagatorDataGpu::getVelocities() +DeviceBuffer StatePropagatorDataGpu::getVelocities() { return impl_->getVelocities(); } @@ -662,7 +656,7 @@ void StatePropagatorDataGpu::waitVelocitiesReadyOnHost(AtomLocality atomLocality } -DeviceBuffer StatePropagatorDataGpu::getForces() +DeviceBuffer StatePropagatorDataGpu::getForces() { return impl_->getForces(); } diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index 89d2e762b8..cd5e346303 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1075,7 +1075,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet, const gmx::AtomLocality locality, bool fillLocal, NbnxmGpu* gpu_nbv, - DeviceBuffer d_x, + DeviceBuffer d_x, GpuEventSynchronizer* xReadyOnDevice) { @@ -1459,7 +1459,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, const gmx::AtomLocality locality, cons /* Add the force array(s) from nbnxn_atomdata_t to f */ void reduceForcesGpu(const gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, + DeviceBuffer totalForcesDevice, const Nbnxm::GridSet& gridSet, void* pmeForcesDevice, gmx::ArrayRef dependencyList, diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index ceb87f71f0..22d115706e 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -376,12 +376,12 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet& gridSet, * \param[in] d_x Coordinates to be copied (in plain rvec format). * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory. */ -void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet, - gmx::AtomLocality locality, - bool fillLocal, - NbnxmGpu* gpu_nbv, - DeviceBuffer d_x, - GpuEventSynchronizer* xReadyOnDevice); +void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet, + gmx::AtomLocality locality, + bool fillLocal, + NbnxmGpu* gpu_nbv, + DeviceBuffer d_x, + GpuEventSynchronizer* xReadyOnDevice); /*! \brief Add the computed forces to \p f, an internal reduction might be performed as well * @@ -404,7 +404,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, gmx::AtomLocality locality, const Nbnx * \param[in] accumulateForce Whether there are usefull data already in the total force buffer. */ void reduceForcesGpu(gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, + DeviceBuffer totalForcesDevice, const Nbnxm::GridSet& gridSet, void* pmeForcesDevice, gmx::ArrayRef dependencyList, diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 94e99879db..2defa174d6 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -818,7 +818,7 @@ void cuda_set_cacheconfig() void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, bool setFillerCoords, NbnxmGpu* nb, - DeviceBuffer d_x, + DeviceBuffer d_x, GpuEventSynchronizer* xReadyOnDevice, const Nbnxm::AtomLocality locality, int gridId, @@ -862,12 +862,13 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, auto kernelFn = setFillerCoords ? nbnxn_gpu_x_to_nbat_x_kernel : nbnxn_gpu_x_to_nbat_x_kernel; float4* d_xq = adat->xq; + float3* d_xFloat3 = asFloat3(d_x); const int* d_atomIndices = nb->atomIndices; const int* d_cxy_na = &nb->cxy_na[numColumnsMax * gridId]; const int* d_cxy_ind = &nb->cxy_ind[numColumnsMax * gridId]; - const auto kernelArgs = - prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq, &d_x, &d_atomIndices, - &d_cxy_na, &d_cxy_ind, &cellOffset, &numAtomsPerCell); + const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq, + &d_xFloat3, &d_atomIndices, &d_cxy_na, + &d_cxy_ind, &cellOffset, &numAtomsPerCell); launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs); } @@ -884,7 +885,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, * forces only after Local stream already done so. */ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, - DeviceBuffer totalForcesDevice, + DeviceBuffer totalForcesDevice, NbnxmGpu* nb, void* pmeForcesDevice, gmx::ArrayRef dependencyList, @@ -935,8 +936,8 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo } const float3* d_fNB = adat->f; - const float3* d_fPme = (float3*)pmeForcesDevice; - float3* d_fTotal = (float3*)totalForcesDevice; + const float3* d_fPme = static_cast(pmeForcesDevice); + float3* d_fTotal = asFloat3(totalForcesDevice); const int* d_cell = nb->cell; const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal, diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 07b24fc923..ab79a8cf5b 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -140,7 +140,7 @@ void nonbonded_verlet_t::convertCoordinates(const gmx::AtomLocality local void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality, const bool fillLocal, - DeviceBuffer d_x, + DeviceBuffer d_x, GpuEventSynchronizer* xReadyOnDevice) { wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); @@ -178,7 +178,7 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const gmx::AtomLocality local } void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, + DeviceBuffer totalForcesDevice, void* forcesPmeDevice, gmx::ArrayRef dependencyList, bool useGpuFPmeReduction, diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 50fed0fe5b..2fa353a848 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -286,10 +286,10 @@ public: * \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed. * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory. */ - void convertCoordinatesGpu(gmx::AtomLocality locality, - bool fillLocal, - DeviceBuffer d_x, - GpuEventSynchronizer* xReadyOnDevice); + void convertCoordinatesGpu(gmx::AtomLocality locality, + bool fillLocal, + DeviceBuffer d_x, + GpuEventSynchronizer* xReadyOnDevice); //! Init for GPU version of setup coordinates in Nbnxm void atomdata_init_copy_x_to_nbat_x_gpu(); @@ -349,7 +349,7 @@ public: * \param [in] accumulateForce If the total force buffer already contains data */ void atomdata_add_nbat_f_to_f_gpu(gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, + DeviceBuffer totalForcesDevice, void* forcesPmeDevice, gmx::ArrayRef dependencyList, bool useGpuFPmeReduction, diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 7b9e4b80f9..2370fc836b 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -242,8 +242,8 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused& gridSet, CUDA_FUNC_QUALIFIER void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid, bool gmx_unused setFillerCoords, - NbnxmGpu gmx_unused* gpu_nbv, - DeviceBuffer gmx_unused d_x, + NbnxmGpu gmx_unused* gpu_nbv, + DeviceBuffer gmx_unused d_x, GpuEventSynchronizer gmx_unused* xReadyOnDevice, gmx::AtomLocality gmx_unused locality, int gmx_unused gridId, @@ -312,7 +312,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell, */ CUDA_FUNC_QUALIFIER void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality, - DeviceBuffer gmx_unused totalForcesDevice, + DeviceBuffer gmx_unused totalForcesDevice, NbnxmGpu gmx_unused* gpu_nbv, void gmx_unused* pmeForcesDevice, gmx::ArrayRef gmx_unused dependencyList, -- 2.22.0