/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* \param [in] d_coordinateBuffer pointer to coordinates buffer in GPU memory
* \param [in] d_forcesBuffer pointer to forces buffer in GPU memory
*/
- void reinitHalo(DeviceBuffer<float> d_coordinateBuffer, DeviceBuffer<float> d_forcesBuffer);
+ void reinitHalo(DeviceBuffer<RVec> d_coordinateBuffer, DeviceBuffer<RVec> d_forcesBuffer);
/*! \brief GPU halo exchange of coordinates buffer.
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
GpuHaloExchange::~GpuHaloExchange() = default;
/*!\brief init halo exchange stub. */
-void GpuHaloExchange::reinitHalo(DeviceBuffer<float> /* d_coordinatesBuffer */,
- DeviceBuffer<float> /* d_forcesBuffer */)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> /* d_coordinatesBuffer */,
+ DeviceBuffer<RVec> /* d_forcesBuffer */)
{
GMX_ASSERT(false,
"A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/math/vectypes.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/gmxmpi.h"
GpuHaloExchange::~GpuHaloExchange() = default;
-void GpuHaloExchange::reinitHalo(DeviceBuffer<float> d_coordinatesBuffer, DeviceBuffer<float> d_forcesBuffer)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceBuffer<RVec> d_forcesBuffer)
{
- impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer),
- reinterpret_cast<float3*>(d_forcesBuffer));
+ impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
}
void GpuHaloExchange::communicateHaloCoordinates(const matrix box,
* \param[in] pme The PME data structure.
* \param[in] d_x The pointer to the positions buffer to be set
*/
-GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
- DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
+ DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
/*! \brief Get pointer to device copy of force data.
* \param[in] pme The PME data structure.
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
//! Controls if the atom and charge data is prefeched into shared memory or loaded per thread from global
static const bool c_useAtomDataPrefetch = true;
+/*! \brief Assert that the argument is finite.
+ *
+ * The function works for any data type that can be cast to float. Note that there is also
+ * a specialized implementation for the float3 data type, which checks each of the three
+ * components separately.
+ *
+ * \tparam    T   Type of the argument; must be castable to float (or be float3).
+ *
+ * \param[in] arg Argument to check.
+ */
+template<typename T>
+__device__ inline void assertIsFinite(T arg);
+
+//! float3 specialization: asserts that all three components are finite.
+template<>
+__device__ inline void assertIsFinite(float3 arg)
+{
+    assert(isfinite(float(arg.x)));
+    assert(isfinite(float(arg.y)));
+    assert(isfinite(float(arg.z)));
+}
+
+//! Generic implementation: casts the argument to float and asserts it is finite.
+template<typename T>
+__device__ inline void assertIsFinite(T arg)
+{
+    assert(isfinite(float(arg)));
+}
+
/*! \brief
* General purpose function for loading atom-related data from global to shared memory.
*
pme_gpu_check_atom_data_index(globalIndex, kernelParams.atoms.nAtoms * dataCountPerAtom);
if ((localIndex < atomsPerBlock * dataCountPerAtom) & globalCheck)
{
- assert(isfinite(float(gm_source[globalIndex])));
+ assertIsFinite(gm_source[globalIndex]);
sm_destination[localIndex] = gm_source[globalIndex];
}
}
#define GMX_PMECOORDINATERECEIVERGPU_H
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/math/vectypes.h"
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/gmxmpi.h"
* send coordinates buffer address to PP rank
* \param[in] d_x coordinates buffer in GPU memory
*/
- void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x);
+ void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
/*! \brief
PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
/*!\brief init PME-PP GPU communication stub */
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> /* d_x */)
+void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> /* d_x */)
{
GMX_ASSERT(false,
"A CPU stub for PME-PP GPU communication initialization was called instead of the "
PmeCoordinateReceiverGpu::Impl::~Impl() = default;
-void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x)
+void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
{
int ind_start = 0;
ind_end = ind_start + receiver.numAtoms;
// Data will be transferred directly from GPU.
- void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start * DIM]);
+ void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
#if GMX_MPI
MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x)
+void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
{
impl_->sendCoordinateBufferAddressToPpRanks(d_x);
}
* send coordinates buffer address to PP rank
* \param[in] d_x coordinates buffer in GPU memory
*/
- void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<float> d_x);
+ void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
/*! \brief
* launch receive of coordinate data from PP rank
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <cassert>
#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
#include "pme.cuh"
#include "pme_calculate_splines.cuh"
}
else
{
+ const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
/* Recalculate Splines */
if (c_useAtomDataPrefetch)
{
// charges
__shared__ float sm_coefficients[atomsPerBlock];
// Coordinates
- __shared__ float sm_coordinates[DIM * atomsPerBlock];
+ __shared__ float3 sm_coordinates[atomsPerBlock];
/* Staging coefficients/charges */
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients,
- kernelParams.atoms.d_coefficients);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(kernelParams, sm_coefficients, gm_coefficients);
/* Staging coordinates */
- pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
- kernelParams.atoms.d_coordinates);
+ pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
__syncthreads();
- atomX.x = sm_coordinates[atomIndexLocal * DIM + XX];
- atomX.y = sm_coordinates[atomIndexLocal * DIM + YY];
- atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ];
+ atomX = sm_coordinates[atomIndexLocal];
atomCharge = sm_coefficients[atomIndexLocal];
}
else
{
+ atomX = gm_coordinates[atomIndexGlobal];
atomCharge = gm_coefficients[atomIndexGlobal];
- atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
- atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
- atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
}
calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
return pme_gpu_get_kernelparam_forces(pme->gpu);
}
-void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<float> d_x)
+void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<gmx::RVec> d_x)
{
GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function.");
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
}
}
-void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<float> d_x)
+void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx::RVec> d_x)
{
GMX_ASSERT(pmeGpu && pmeGpu->kernelParams,
"PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was "
* \param[in] d_x Pointer to coordinate data
*/
GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
- DeviceBuffer<float> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+ DeviceBuffer<gmx::RVec> GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
/*! \brief Return pointer to device copy of force data.
* \param[in] pmeGpu The PME GPU structure.
* The coordinates themselves change and need to be copied to the GPU for every PME computation,
* but reallocation happens only at DD.
*/
- HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_coordinates;
+ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_coordinates;
/*! \brief Global GPU memory array handle with input atom charges.
* The charges only need to be reallocated and copied to the GPU at DD step.
*/
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013-2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include <cassert>
#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/cudautils.cuh"
#include "pme.cuh"
#include "pme_calculate_splines.cuh"
if (computeSplines)
{
+ const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
if (c_useAtomDataPrefetch)
{
// Coordinates
- __shared__ float sm_coordinates[DIM * atomsPerBlock];
+ __shared__ float3 sm_coordinates[atomsPerBlock];
/* Staging coordinates */
- pme_gpu_stage_atom_data<float, atomsPerBlock, DIM>(kernelParams, sm_coordinates,
- kernelParams.atoms.d_coordinates);
+ pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(kernelParams, sm_coordinates, gm_coordinates);
__syncthreads();
- atomX.x = sm_coordinates[atomIndexLocal * DIM + XX];
- atomX.y = sm_coordinates[atomIndexLocal * DIM + YY];
- atomX.z = sm_coordinates[atomIndexLocal * DIM + ZZ];
+ atomX = sm_coordinates[atomIndexLocal];
}
else
{
- atomX.x = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + XX];
- atomX.y = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + YY];
- atomX.z = kernelParams.atoms.d_coordinates[atomIndexGlobal * DIM + ZZ];
+ atomX = gm_coordinates[atomIndexGlobal];
}
calculate_splines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal>(
kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, &dtheta, sm_gridlineIndices);
rvec tmp = { b.x, b.y, b.z };
rvec_inc(a, tmp);
}
+/*! \brief Cast an RVec buffer to a float3 buffer.
+ *
+ * The static_assert below guards against a size mismatch between the host-side
+ * RVec and the device-side float3 types at compile time (e.g. if the two were
+ * built with different precision). NOTE(review): only sizes are checked here,
+ * not the internal layout — presumed identical; confirm for all build configs.
+ *
+ * \param[in] in The RVec buffer to cast.
+ *
+ * \returns The same buffer, reinterpreted as float3*.
+ */
+static inline __host__ __device__ float3* asFloat3(gmx::RVec* in)
+{
+    static_assert(sizeof(in[0]) == sizeof(float3),
+                  "Size of the host-side data-type is different from the size of the device-side "
+                  "counterpart.");
+    return reinterpret_cast<float3*>(in);
+}
/*! \brief Wait for all taks in stream \p s to complete.
*
GMX_ASSERT(buffer, "needs a buffer pointer");
const size_t offset = startingOffset * sizeof(ValueType);
const size_t bytes = numValues * sizeof(ValueType);
- const ValueType pattern = 0;
+ const int pattern = 0;
const cl_uint numWaitEvents = 0;
const cl_event* waitEvents = nullptr;
cl_event commandEvent;
#
# This file is part of the GROMACS molecular simulation package.
#
-# Copyright (c) 2017,2018,2019, by the GROMACS development team, led by
+# Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
# CUDA-only test
list(APPEND SOURCES_FROM_CXX
pinnedmemorychecker.cpp
+ gpu_utils.cpp
)
# TODO Making a separate library is heavy handed, but nothing else
# seems to work. Also don't use a hyphen in its name, because nvcc
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Tests for CUDA float3 type layout.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include <vector>
+
+#ifndef __CUDA_ARCH__
+/*! \brief Dummy definition to avoid compiler error
+ *
+ * \todo Find a better solution. Probably, move asFloat3(...) function to different header.
+ */
+# define __CUDA_ARCH__ -1
+# include <cuda_runtime.h>
+# undef __CUDA_ARCH__
+#else
+# include <cuda_runtime.h>
+#endif
+#include <gtest/gtest.h>
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/real.h"
+
+#if GMX_GPU == GMX_GPU_CUDA
+
+namespace gmx
+{
+
+namespace test
+{
+
+TEST(GpuDataTypesCompatibilityTest, RVecAndFloat3)
+{
+    // Fill a host-side buffer with two distinct RVec values so that every
+    // component can be told apart after the cast.
+    std::vector<RVec> dataRVec;
+    dataRVec.emplace_back(1.0, 2.0, 3.0);
+    dataRVec.emplace_back(4.0, 5.0, 6.0);
+    // Reinterpret the RVec array as float3 and check component-by-component
+    // that the memory layouts of the two types agree.
+    float3* dataFloat3 = asFloat3(dataRVec.data());
+    EXPECT_EQ(dataFloat3[0].x, dataRVec[0][XX]);
+    EXPECT_EQ(dataFloat3[0].y, dataRVec[0][YY]);
+    EXPECT_EQ(dataFloat3[0].z, dataRVec[0][ZZ]);
+    EXPECT_EQ(dataFloat3[1].x, dataRVec[1][XX]);
+    EXPECT_EQ(dataFloat3[1].y, dataRVec[1][YY]);
+    EXPECT_EQ(dataFloat3[1].z, dataRVec[1][ZZ]);
+}
+
+} // namespace test
+} // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_CUDA
\ No newline at end of file
* \param[in] md Atoms data.
* \param[in] numTempScaleValues Number of temperature scaling groups. Zero for no temperature scaling.
*/
- void set(DeviceBuffer<float> d_x,
- DeviceBuffer<float> d_v,
- DeviceBuffer<float> d_f,
- const t_idef& idef,
- const t_mdatoms& md,
- int numTempScaleValues);
+ void set(DeviceBuffer<RVec> d_x,
+ DeviceBuffer<RVec> d_v,
+ DeviceBuffer<RVec> d_f,
+ const t_idef& idef,
+ const t_mdatoms& md,
+ int numTempScaleValues);
/*! \brief
* Update PBC data.
"A CPU stub for UpdateConstrain was called instead of the correct implementation.");
}
-void UpdateConstrainGpu::set(DeviceBuffer<float> /* d_x */,
- DeviceBuffer<float> /* d_v */,
- const DeviceBuffer<float> /* d_f */,
+void UpdateConstrainGpu::set(DeviceBuffer<RVec> /* d_x */,
+ DeviceBuffer<RVec> /* d_v */,
+ const DeviceBuffer<RVec> /* d_f */,
const t_idef& /* idef */,
const t_mdatoms& /* md */,
const int /* numTempScaleValues */)
UpdateConstrainGpu::Impl::~Impl() {}
-void UpdateConstrainGpu::Impl::set(DeviceBuffer<float> d_x,
- DeviceBuffer<float> d_v,
- const DeviceBuffer<float> d_f,
- const t_idef& idef,
- const t_mdatoms& md,
- const int numTempScaleValues)
+void UpdateConstrainGpu::Impl::set(DeviceBuffer<RVec> d_x,
+ DeviceBuffer<RVec> d_v,
+ const DeviceBuffer<RVec> d_f,
+ const t_idef& idef,
+ const t_mdatoms& md,
+ const int numTempScaleValues)
{
GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
impl_->scaleCoordinates(scalingMatrix);
}
-void UpdateConstrainGpu::set(DeviceBuffer<float> d_x,
- DeviceBuffer<float> d_v,
- const DeviceBuffer<float> d_f,
- const t_idef& idef,
- const t_mdatoms& md,
- const int numTempScaleValues)
+void UpdateConstrainGpu::set(DeviceBuffer<RVec> d_x,
+ DeviceBuffer<RVec> d_v,
+ const DeviceBuffer<RVec> d_f,
+ const t_idef& idef,
+ const t_mdatoms& md,
+ const int numTempScaleValues)
{
impl_->set(d_x, d_v, d_f, idef, md, numTempScaleValues);
}
* \param[in] md Atoms data.
* \param[in] numTempScaleValues Number of temperature scaling groups. Set zero for no temperature coupling.
*/
- void set(DeviceBuffer<float> d_x,
- DeviceBuffer<float> d_v,
- const DeviceBuffer<float> d_f,
- const t_idef& idef,
- const t_mdatoms& md,
- const int numTempScaleValues);
+ void set(DeviceBuffer<RVec> d_x,
+ DeviceBuffer<RVec> d_v,
+ const DeviceBuffer<RVec> d_f,
+ const t_idef& idef,
+ const t_mdatoms& md,
+ const int numTempScaleValues);
/*! \brief
* Update PBC data.
*
* \returns GPU positions buffer.
*/
- DeviceBuffer<float> getCoordinates();
+ DeviceBuffer<RVec> getCoordinates();
/*! \brief Copy positions to the GPU memory.
*
*
* \returns GPU velocities buffer.
*/
- DeviceBuffer<float> getVelocities();
+ DeviceBuffer<RVec> getVelocities();
/*! \brief Copy velocities to the GPU memory.
*
*
* \returns GPU force buffer.
*/
- DeviceBuffer<float> getForces();
+ DeviceBuffer<RVec> getForces();
/*! \brief Copy forces to the GPU memory.
*
return std::make_tuple(0, 0);
}
-DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
{
GMX_ASSERT(false,
"A CPU stub method from GPU state propagator data was called instead of one from "
"GPU implementation.");
- return DeviceBuffer<float>{};
+ return {};
}
GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(
}
-DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
{
GMX_ASSERT(false,
"A CPU stub method from GPU state propagator data was called instead of one from "
"GPU implementation.");
- return DeviceBuffer<float>{};
+ return {};
}
void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_v */,
}
-DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
{
GMX_ASSERT(false,
"A CPU stub method from GPU state propagator data was called instead of one from "
"GPU implementation.");
- return DeviceBuffer<float>{};
+ return {};
}
void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> /* h_f */,
*
* \returns GPU positions buffer.
*/
- DeviceBuffer<float> getCoordinates();
+ DeviceBuffer<RVec> getCoordinates();
/*! \brief Copy positions to the GPU memory.
*
*
* \returns GPU velocities buffer.
*/
- DeviceBuffer<float> getVelocities();
+ DeviceBuffer<RVec> getVelocities();
/*! \brief Copy velocities to the GPU memory.
*
*
* \returns GPU force buffer.
*/
- DeviceBuffer<float> getForces();
+ DeviceBuffer<RVec> getForces();
/*! \brief Copy forces to the GPU memory.
*
int numAtomsAll_ = -1;
//! Device positions buffer
- DeviceBuffer<float> d_x_;
+ DeviceBuffer<RVec> d_x_;
//! Number of particles saved in the positions buffer
int d_xSize_ = -1;
//! Allocation size for the positions buffer
int d_xCapacity_ = -1;
//! Device velocities buffer
- DeviceBuffer<float> d_v_;
+ DeviceBuffer<RVec> d_v_;
//! Number of particles saved in the velocities buffer
int d_vSize_ = -1;
//! Allocation size for the velocities buffer
int d_vCapacity_ = -1;
//! Device force buffer
- DeviceBuffer<float> d_f_;
+ DeviceBuffer<RVec> d_f_;
//! Number of particles saved in the force buffer
int d_fSize_ = -1;
//! Allocation size for the force buffer
* \param[in] atomLocality If all, local or non-local ranges should be copied.
* \param[in] commandStream GPU stream to execute copy in.
*/
- void copyToDevice(DeviceBuffer<float> d_data,
+ void copyToDevice(DeviceBuffer<RVec> d_data,
gmx::ArrayRef<const gmx::RVec> h_data,
int dataSize,
AtomLocality atomLocality,
* \param[in] commandStream GPU stream to execute copy in.
*/
void copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
- DeviceBuffer<float> d_data,
+ DeviceBuffer<RVec> d_data,
int dataSize,
AtomLocality atomLocality,
CommandStream commandStream);
numAtomsPadded = numAtomsAll_;
}
- reallocateDeviceBuffer(&d_x_, DIM * numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
+ reallocateDeviceBuffer(&d_x_, numAtomsPadded, &d_xSize_, &d_xCapacity_, deviceContext_);
const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
if (paddingAllocationSize > 0)
{
// The PME stream is used here because the padding region of d_x_ is only in the PME task.
- clearDeviceBufferAsync(&d_x_, DIM * numAtomsAll_, DIM * paddingAllocationSize, pmeStream_);
+ clearDeviceBufferAsync(&d_x_, numAtomsAll_, paddingAllocationSize, pmeStream_);
}
- reallocateDeviceBuffer(&d_v_, DIM * numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
+ reallocateDeviceBuffer(&d_v_, numAtomsAll_, &d_vSize_, &d_vCapacity_, deviceContext_);
const int d_fOldCapacity = d_fCapacity_;
- reallocateDeviceBuffer(&d_f_, DIM * numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
+ reallocateDeviceBuffer(&d_f_, numAtomsAll_, &d_fSize_, &d_fCapacity_, deviceContext_);
// Clearing of the forces can be done in local stream since the nonlocal stream cannot reach
// the force accumulation stage before syncing with the local stream. Only done in CUDA,
// since the force buffer ops are not implemented in OpenCL.
return std::make_tuple(atomsStartAt, numAtomsToCopy);
}
-void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float> d_data,
+void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec> d_data,
const gmx::ArrayRef<const gmx::RVec> h_data,
int dataSize,
AtomLocality atomLocality,
int atomsStartAt, numAtomsToCopy;
std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
- int elementsStartAt = atomsStartAt * DIM;
- int numElementsToCopy = numAtomsToCopy * DIM;
-
if (numAtomsToCopy != 0)
{
- GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
+ GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
"The device allocation is smaller than requested copy range.");
GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
"The host buffer is smaller than the requested copy range.");
- copyToDeviceBuffer(&d_data, reinterpret_cast<const float*>(&h_data.data()[atomsStartAt]),
- elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
+ copyToDeviceBuffer(&d_data, reinterpret_cast<const RVec*>(&h_data.data()[atomsStartAt]),
+ atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
}
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
}
void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
- DeviceBuffer<float> d_data,
+ DeviceBuffer<RVec> d_data,
int dataSize,
AtomLocality atomLocality,
CommandStream commandStream)
int atomsStartAt, numAtomsToCopy;
std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
- int elementsStartAt = atomsStartAt * DIM;
- int numElementsToCopy = numAtomsToCopy * DIM;
-
if (numAtomsToCopy != 0)
{
- GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize,
+ GMX_ASSERT(atomsStartAt + numAtomsToCopy <= dataSize,
"The device allocation is smaller than requested copy range.");
GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(),
"The host buffer is smaller than the requested copy range.");
- copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
- elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
+ copyFromDeviceBuffer(reinterpret_cast<RVec*>(&h_data.data()[atomsStartAt]), &d_data,
+ atomsStartAt, numAtomsToCopy, commandStream, transferKind_, nullptr);
}
wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getCoordinates()
{
return d_x_;
}
}
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getVelocities()
{
return d_v_;
}
}
-DeviceBuffer<float> StatePropagatorDataGpu::Impl::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
{
return d_f_;
}
}
-DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
{
return impl_->getCoordinates();
}
}
-DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
{
return impl_->getVelocities();
}
}
-DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+DeviceBuffer<RVec> StatePropagatorDataGpu::getForces()
{
return impl_->getForces();
}
const gmx::AtomLocality locality,
bool fillLocal,
NbnxmGpu* gpu_nbv,
- DeviceBuffer<float> d_x,
+ DeviceBuffer<RVec> d_x,
GpuEventSynchronizer* xReadyOnDevice)
{
/* Add the force array(s) from nbnxn_atomdata_t to f */
void reduceForcesGpu(const gmx::AtomLocality locality,
- DeviceBuffer<float> totalForcesDevice,
+ DeviceBuffer<RVec> totalForcesDevice,
const Nbnxm::GridSet& gridSet,
void* pmeForcesDevice,
gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
* \param[in] d_x Coordinates to be copied (in plain rvec format).
* \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
*/
-void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet,
- gmx::AtomLocality locality,
- bool fillLocal,
- NbnxmGpu* gpu_nbv,
- DeviceBuffer<float> d_x,
- GpuEventSynchronizer* xReadyOnDevice);
+void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet,
+ gmx::AtomLocality locality,
+ bool fillLocal,
+ NbnxmGpu* gpu_nbv,
+ DeviceBuffer<gmx::RVec> d_x,
+ GpuEventSynchronizer* xReadyOnDevice);
/*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
*
* \param[in] accumulateForce Whether there is useful data already in the total force buffer.
*/
void reduceForcesGpu(gmx::AtomLocality locality,
- DeviceBuffer<float> totalForcesDevice,
+ DeviceBuffer<gmx::RVec> totalForcesDevice,
const Nbnxm::GridSet& gridSet,
void* pmeForcesDevice,
gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid,
bool setFillerCoords,
NbnxmGpu* nb,
- DeviceBuffer<float> d_x,
+ DeviceBuffer<gmx::RVec> d_x,
GpuEventSynchronizer* xReadyOnDevice,
const Nbnxm::AtomLocality locality,
int gridId,
auto kernelFn = setFillerCoords ? nbnxn_gpu_x_to_nbat_x_kernel<true>
: nbnxn_gpu_x_to_nbat_x_kernel<false>;
float4* d_xq = adat->xq;
+ float3* d_xFloat3 = asFloat3(d_x);
const int* d_atomIndices = nb->atomIndices;
const int* d_cxy_na = &nb->cxy_na[numColumnsMax * gridId];
const int* d_cxy_ind = &nb->cxy_ind[numColumnsMax * gridId];
- const auto kernelArgs =
- prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq, &d_x, &d_atomIndices,
- &d_cxy_na, &d_cxy_ind, &cellOffset, &numAtomsPerCell);
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq,
+ &d_xFloat3, &d_atomIndices, &d_cxy_na,
+ &d_cxy_ind, &cellOffset, &numAtomsPerCell);
launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
}
* forces only after Local stream already done so.
*/
void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
- DeviceBuffer<float> totalForcesDevice,
+ DeviceBuffer<gmx::RVec> totalForcesDevice,
NbnxmGpu* nb,
void* pmeForcesDevice,
gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
}
const float3* d_fNB = adat->f;
- const float3* d_fPme = (float3*)pmeForcesDevice;
- float3* d_fTotal = (float3*)totalForcesDevice;
+ const float3* d_fPme = static_cast<float3*>(pmeForcesDevice);
+ float3* d_fTotal = asFloat3(totalForcesDevice);
const int* d_cell = nb->cell;
const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal,
void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality,
const bool fillLocal,
- DeviceBuffer<float> d_x,
+ DeviceBuffer<gmx::RVec> d_x,
GpuEventSynchronizer* xReadyOnDevice)
{
wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
}
void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality locality,
- DeviceBuffer<float> totalForcesDevice,
+ DeviceBuffer<gmx::RVec> totalForcesDevice,
void* forcesPmeDevice,
gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
bool useGpuFPmeReduction,
* \param[in] d_x GPU coordinates buffer in plain rvec format to be transformed.
* \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
*/
- void convertCoordinatesGpu(gmx::AtomLocality locality,
- bool fillLocal,
- DeviceBuffer<float> d_x,
- GpuEventSynchronizer* xReadyOnDevice);
+ void convertCoordinatesGpu(gmx::AtomLocality locality,
+ bool fillLocal,
+ DeviceBuffer<gmx::RVec> d_x,
+ GpuEventSynchronizer* xReadyOnDevice);
//! Init for GPU version of setup coordinates in Nbnxm
void atomdata_init_copy_x_to_nbat_x_gpu();
* \param [in] accumulateForce If the total force buffer already contains data
*/
void atomdata_add_nbat_f_to_f_gpu(gmx::AtomLocality locality,
- DeviceBuffer<float> totalForcesDevice,
+ DeviceBuffer<gmx::RVec> totalForcesDevice,
void* forcesPmeDevice,
gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
bool useGpuFPmeReduction,
CUDA_FUNC_QUALIFIER
void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid gmx_unused& grid,
bool gmx_unused setFillerCoords,
- NbnxmGpu gmx_unused* gpu_nbv,
- DeviceBuffer<float> gmx_unused d_x,
+ NbnxmGpu gmx_unused* gpu_nbv,
+ DeviceBuffer<gmx::RVec> gmx_unused d_x,
GpuEventSynchronizer gmx_unused* xReadyOnDevice,
gmx::AtomLocality gmx_unused locality,
int gmx_unused gridId,
*/
CUDA_FUNC_QUALIFIER
void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
- DeviceBuffer<float> gmx_unused totalForcesDevice,
+ DeviceBuffer<gmx::RVec> gmx_unused totalForcesDevice,
NbnxmGpu gmx_unused* gpu_nbv,
void gmx_unused* pmeForcesDevice,
gmx::ArrayRef<GpuEventSynchronizer* const> gmx_unused dependencyList,