From aac166c9ccb2c386c6017cd56664ad43f759b0b5 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Tue, 11 May 2021 11:25:26 +0000 Subject: [PATCH] Make common Update-Constraints code backend-agnostic --- src/gromacs/mdlib/CMakeLists.txt | 12 +- .../mdlib/update_constrain_gpu_impl.cpp | 262 +++++++++++--- .../mdlib/update_constrain_gpu_impl.cu | 339 ------------------ src/gromacs/mdlib/update_constrain_gpu_impl.h | 13 - .../mdlib/update_constrain_gpu_impl_stubs.cpp | 131 +++++++ .../mdlib/update_constrain_gpu_internal.cu | 105 ++++++ .../mdlib/update_constrain_gpu_internal.h | 84 +++++ .../update_constrain_gpu_internal_sycl.cpp | 61 ++++ 8 files changed, 600 insertions(+), 407 deletions(-) delete mode 100644 src/gromacs/mdlib/update_constrain_gpu_impl.cu create mode 100644 src/gromacs/mdlib/update_constrain_gpu_impl_stubs.cpp create mode 100644 src/gromacs/mdlib/update_constrain_gpu_internal.cu create mode 100644 src/gromacs/mdlib/update_constrain_gpu_internal.h create mode 100644 src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index 511a946809..2e645f0362 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -44,7 +44,9 @@ list(REMOVE_ITEM MDLIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu_internal_sycl.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu_internal_sycl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/update_constrain_gpu_impl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/update_constrain_gpu_internal_sycl.cpp) set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE) if(GMX_GPU_CUDA) @@ -56,12 +58,14 @@ if(GMX_GPU_CUDA) lincs_gpu_internal.cu settle_gpu.cpp settle_gpu_internal.cu - update_constrain_gpu_impl.cu + update_constrain_gpu_impl.cpp + update_constrain_gpu_internal.cu ) _gmx_add_files_to_property(CUDA_SOURCES gpuforcereduction_impl.cpp lincs_gpu.cpp settle_gpu.cpp + update_constrain_gpu_impl.cpp ) endif() @@ -74,6 +78,8 @@ if(GMX_GPU_SYCL) lincs_gpu_internal_sycl.cpp settle_gpu.cpp settle_gpu_internal_sycl.cpp + update_constrain_gpu_impl.cpp + update_constrain_gpu_internal_sycl.cpp ) _gmx_add_files_to_property(SYCL_SOURCES @@ -84,6 +90,8 @@ if(GMX_GPU_SYCL) lincs_gpu_internal_sycl.cpp settle_gpu.cpp settle_gpu_internal_sycl.cpp + update_constrain_gpu_impl.cpp + update_constrain_gpu_internal_sycl.cpp ) endif() diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp index b0f3872860..df6420d378 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp @@ -34,7 +34,12 @@ */ /*! \internal \file * - * \brief Stub for update and constraints class CPU implementation. + * \brief Implements update and constraints class. + * + * The class combines Leap-Frog integrator with LINCS and SETTLE constraints. + * + * \todo The computational procedures in members should be integrated to improve + * computational performance. * * \author Artem Zhmurov * @@ -42,90 +47,241 @@ */ #include "gmxpre.h" -#include "config.h" +#include "update_constrain_gpu_impl.h" -#include "gromacs/mdlib/update_constrain_gpu.h" -#include "gromacs/utility/gmxassert.h" +#include +#include + +#include -#if !GMX_GPU_CUDA +#include + +#include "gromacs/gpu_utils/device_context.h" +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/gpu_utils/devicebuffer.h" +#if GMX_GPU_CUDA +# include "gromacs/gpu_utils/gpueventsynchronizer.cuh" +#elif GMX_GPU_SYCL +# include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h" +#endif +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/mdlib/leapfrog_gpu.h" +#include "gromacs/mdlib/update_constrain_gpu.h" +#include "gromacs/mdlib/update_constrain_gpu_internal.h" +#include "gromacs/mdtypes/mdatom.h" +#include "gromacs/timing/wallcycle.h" namespace gmx { -class UpdateConstrainGpu::Impl +void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fReadyOnDevice, + const real dt, + const bool updateVelocities, + const bool computeVirial, + tensor virial, + const bool doTemperatureScaling, + gmx::ArrayRef tcstat, + const bool doParrinelloRahman, + const float dtPressureCouple, + const matrix prVelocityScalingMatrix) { -}; + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + + // Clearing virial matrix + // TODO There is no point in having separate virial matrix for constraints + clear_mat(virial); -UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir */, - const gmx_mtop_t& /* mtop */, - const int /* numTempScaleValues */, - const DeviceContext& /* deviceContext */, - const DeviceStream& /* deviceStream */, - GpuEventSynchronizer* /* xUpdatedOnDevice */, - gmx_wallcycle* /*wcycle*/) : - impl_(nullptr) + // Make sure that the forces are ready on device before proceeding with the update. + fReadyOnDevice->enqueueWaitEvent(deviceStream_); + + // The integrate should save a copy of the current coordinates in d_xp_ and write updated + // once into d_x_. The d_xp_ is only needed by constraints. + integrator_->integrate( + d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat, doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix); + // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints + // are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the + // d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well. + lincsGpu_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_); + settleGpu_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_); + + // scaledVirial -> virial (methods above returns scaled values) + float scaleFactor = 0.5F / (dt * dt); + for (int i = 0; i < DIM; i++) + { + for (int j = 0; j < DIM; j++) + { + virial[i][j] = scaleFactor * virial[i][j]; + } + } + + coordinatesReady_->markEvent(deviceStream_); + + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); + + return; +} + +void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix) +{ + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + + ScalingMatrix mu(scalingMatrix); + + launchScaleCoordinatesKernel(numAtoms_, d_x_, mu, deviceStream_); + + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); +} + +void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix) +{ + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + + ScalingMatrix mu(scalingMatrix); + + launchScaleCoordinatesKernel(numAtoms_, d_v_, mu, deviceStream_); + + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); +} + +UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir, + const gmx_mtop_t& mtop, + const int numTempScaleValues, + const DeviceContext& deviceContext, + const DeviceStream& deviceStream, + GpuEventSynchronizer* xUpdatedOnDevice, + gmx_wallcycle* wcycle) : + deviceContext_(deviceContext), + deviceStream_(deviceStream), + coordinatesReady_(xUpdatedOnDevice), + wcycle_(wcycle) +{ + GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr."); + + integrator_ = std::make_unique(deviceContext_, deviceStream_, numTempScaleValues); + lincsGpu_ = std::make_unique(ir.nLincsIter, ir.nProjOrder, deviceContext_, deviceStream_); + settleGpu_ = std::make_unique(mtop, deviceContext_, deviceStream_); +} + +UpdateConstrainGpu::Impl::~Impl() {} + +void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, + const InteractionDefinitions& idef, + const t_mdatoms& md) +{ + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); + wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + + GMX_ASSERT(d_x, "Coordinates device buffer should not be null."); + GMX_ASSERT(d_v, "Velocities device buffer should not be null."); + GMX_ASSERT(d_f, "Forces device buffer should not be null."); + + d_x_ = d_x; + d_v_ = d_v; + d_f_ = d_f; + + numAtoms_ = md.nr; + + reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, deviceContext_); + + reallocateDeviceBuffer( + &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_); + + // Integrator should also update something, but it does not even have a method yet + integrator_->set(numAtoms_, md.invmass, md.cTC); + lincsGpu_->set(idef, numAtoms_, md.invmass); + settleGpu_->set(idef); + + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); + wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); +} + +void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box) +{ + // TODO wallcycle + setPbcAiuc(numPbcDimensions(pbcType), box, &pbcAiuc_); +} + +GpuEventSynchronizer* UpdateConstrainGpu::Impl::getCoordinatesReadySync() +{ + return coordinatesReady_; +} + +UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& ir, + const gmx_mtop_t& mtop, + const int numTempScaleValues, + const DeviceContext& deviceContext, + const DeviceStream& deviceStream, + GpuEventSynchronizer* xUpdatedOnDevice, + gmx_wallcycle* wcycle) : + impl_(new Impl(ir, mtop, numTempScaleValues, deviceContext, deviceStream, xUpdatedOnDevice, wcycle)) { - GMX_ASSERT(!impl_, - "A CPU stub for UpdateConstrain was called instead of the correct implementation."); } UpdateConstrainGpu::~UpdateConstrainGpu() = default; -void UpdateConstrainGpu::integrate(GpuEventSynchronizer* /* fReadyOnDevice */, - const real /* dt */, - const bool /* updateVelocities */, - const bool /* computeVirial */, - tensor /* virialScaled */, - const bool /* doTemperatureScaling */, - gmx::ArrayRef /* tcstat */, - const bool /* doParrinelloRahman */, - const float /* dtPressureCouple */, - const matrix /* prVelocityScalingMatrix*/) -{ - GMX_ASSERT(!impl_, - "A CPU stub for UpdateConstrain was called instead of the correct implementation."); +void UpdateConstrainGpu::integrate(GpuEventSynchronizer* fReadyOnDevice, + const real dt, + const bool updateVelocities, + const bool computeVirial, + tensor virialScaled, + const bool doTemperatureScaling, + gmx::ArrayRef tcstat, + const bool doParrinelloRahman, + const float dtPressureCouple, + const matrix prVelocityScalingMatrix) +{ + impl_->integrate(fReadyOnDevice, + dt, + updateVelocities, + computeVirial, + virialScaled, + doTemperatureScaling, + tcstat, + doParrinelloRahman, + dtPressureCouple, + prVelocityScalingMatrix); } -void UpdateConstrainGpu::scaleCoordinates(const matrix /* scalingMatrix */) +void UpdateConstrainGpu::scaleCoordinates(const matrix scalingMatrix) { - GMX_ASSERT(!impl_, - "A CPU stub for UpdateConstrain was called instead of the correct implementation."); + impl_->scaleCoordinates(scalingMatrix); } -void UpdateConstrainGpu::scaleVelocities(const matrix /* scalingMatrix */) +void UpdateConstrainGpu::scaleVelocities(const matrix scalingMatrix) { - GMX_ASSERT(!impl_, - "A CPU stub for UpdateConstrain was called instead of the correct implementation."); + impl_->scaleVelocities(scalingMatrix); } -void UpdateConstrainGpu::set(DeviceBuffer /* d_x */, - DeviceBuffer /* d_v */, - const DeviceBuffer /* d_f */, - const InteractionDefinitions& /* idef */, - const t_mdatoms& /* md */) +void UpdateConstrainGpu::set(DeviceBuffer d_x, + DeviceBuffer d_v, + const DeviceBuffer d_f, + const InteractionDefinitions& idef, + const t_mdatoms& md) { - GMX_ASSERT(!impl_, - "A CPU stub for UpdateConstrain was called instead of the correct implementation."); + impl_->set(d_x, d_v, d_f, idef, md); } -void UpdateConstrainGpu::setPbc(const PbcType /* pbcType */, const matrix /* box */) +void UpdateConstrainGpu::setPbc(const PbcType pbcType, const matrix box) { - GMX_ASSERT(!impl_, - "A CPU stub for UpdateConstrain was called instead of the correct implementation."); + impl_->setPbc(pbcType, box); } GpuEventSynchronizer* UpdateConstrainGpu::getCoordinatesReadySync() { - GMX_ASSERT(!impl_, - "A CPU stub for UpdateConstrain was called instead of the correct implementation."); - return nullptr; + return impl_->getCoordinatesReadySync(); } -bool UpdateConstrainGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& /* mtop */) +bool UpdateConstrainGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop) { - return false; + return LincsGpu::isNumCoupledConstraintsSupported(mtop); } } // namespace gmx - -#endif /* !GMX_GPU_CUDA */ diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu deleted file mode 100644 index 7ef37c0940..0000000000 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ /dev/null @@ -1,339 +0,0 @@ -/* - * This file is part of the GROMACS molecular simulation package. - * - * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by - * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, - * and including many others, as listed in the AUTHORS file in the - * top-level source directory and at http://www.gromacs.org. - * - * GROMACS is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public License - * as published by the Free Software Foundation; either version 2.1 - * of the License, or (at your option) any later version. - * - * GROMACS is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with GROMACS; if not, see - * http://www.gnu.org/licenses, or write to the Free Software Foundation, - * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - * - * If you want to redistribute modifications to GROMACS, please - * consider that scientific software is very special. Version - * control is crucial - bugs must be traceable. We will be happy to - * consider code for inclusion in the official distribution, but - * derived work must not be called official GROMACS. Details are found - * in the README & COPYING files - if they are missing, get the - * official version at http://www.gromacs.org. - * - * To help us fund GROMACS development, we humbly ask that you cite - * the research papers on the package. Check out http://www.gromacs.org. - */ -/*! \internal \file - * - * \brief Implements update and constraints class using CUDA. - * - * The class combines Leap-Frog integrator with LINCS and SETTLE constraints. - * - * \todo The computational procedures in members should be integrated to improve - * computational performance. - * - * \author Artem Zhmurov - * - * \ingroup module_mdlib - */ -#include "gmxpre.h" - -#include "update_constrain_gpu_impl.h" - -#include -#include - -#include - -#include - -#include "gromacs/gpu_utils/cudautils.cuh" -#include "gromacs/gpu_utils/device_context.h" -#include "gromacs/gpu_utils/device_stream.h" -#include "gromacs/gpu_utils/devicebuffer.h" -#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" -#include "gromacs/gpu_utils/gputraits.cuh" -#include "gromacs/gpu_utils/vectype_ops.cuh" -#include "gromacs/mdlib/leapfrog_gpu.h" -#include "gromacs/mdlib/update_constrain_gpu.h" -#include "gromacs/mdtypes/mdatom.h" -#include "gromacs/timing/wallcycle.h" - -namespace gmx -{ -/*!\brief Number of CUDA threads in a block - * - * \todo Check if using smaller block size will lead to better performance. - */ -constexpr static int c_threadsPerBlock = 256; -//! Maximum number of threads in a block (for __launch_bounds__) -constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock; - -__launch_bounds__(c_maxThreadsPerBlock) __global__ - static void scaleCoordinates_kernel(const int numAtoms, - float3* __restrict__ gm_x, - const ScalingMatrix scalingMatrix) -{ - int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; - if (threadIndex < numAtoms) - { - float3 x = gm_x[threadIndex]; - - x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z; - x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z; - x.z = scalingMatrix.zz * x.z; - - gm_x[threadIndex] = x; - } -} - -void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer* fReadyOnDevice, - const real dt, - const bool updateVelocities, - const bool computeVirial, - tensor virial, - const bool doTemperatureScaling, - gmx::ArrayRef tcstat, - const bool doParrinelloRahman, - const float dtPressureCouple, - const matrix prVelocityScalingMatrix) -{ - wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); - wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); - - // Clearing virial matrix - // TODO There is no point in having separate virial matrix for constraints - clear_mat(virial); - - // Make sure that the forces are ready on device before proceeding with the update. - fReadyOnDevice->enqueueWaitEvent(deviceStream_); - - // The integrate should save a copy of the current coordinates in d_xp_ and write updated - // once into d_x_. The d_xp_ is only needed by constraints. - integrator_->integrate( - d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat, doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix); - // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints - // are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the - // d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well. - lincsGpu_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_); - settleGpu_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_); - - // scaledVirial -> virial (methods above returns scaled values) - float scaleFactor = 0.5f / (dt * dt); - for (int i = 0; i < DIM; i++) - { - for (int j = 0; j < DIM; j++) - { - virial[i][j] = scaleFactor * virial[i][j]; - } - } - - coordinatesReady_->markEvent(deviceStream_); - - wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); - wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); - - return; -} - -void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix) -{ - wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); - wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); - - ScalingMatrix mu(scalingMatrix); - - const auto kernelArgs = prepareGpuKernelArguments( - scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu); - launchGpuKernel(scaleCoordinates_kernel, - coordinateScalingKernelLaunchConfig_, - deviceStream_, - nullptr, - "scaleCoordinates_kernel", - kernelArgs); - // TODO: Although this only happens on the pressure coupling steps, this synchronization - // can affect the performance if nstpcouple is small. - deviceStream_.synchronize(); - - wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); - wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); -} - -void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix) -{ - wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); - wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); - - ScalingMatrix mu(scalingMatrix); - - const auto kernelArgs = prepareGpuKernelArguments( - scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_v_, &mu); - launchGpuKernel(scaleCoordinates_kernel, - coordinateScalingKernelLaunchConfig_, - deviceStream_, - nullptr, - "scaleCoordinates_kernel", - kernelArgs); - // TODO: Although this only happens on the pressure coupling steps, this synchronization - // can affect the performance if nstpcouple is small. - deviceStream_.synchronize(); - - wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); - wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); -} - -UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir, - const gmx_mtop_t& mtop, - const int numTempScaleValues, - const DeviceContext& deviceContext, - const DeviceStream& deviceStream, - GpuEventSynchronizer* xUpdatedOnDevice, - gmx_wallcycle* wcycle) : - deviceContext_(deviceContext), - deviceStream_(deviceStream), - coordinatesReady_(xUpdatedOnDevice), - wcycle_(wcycle) -{ - GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr."); - - - integrator_ = std::make_unique(deviceContext_, deviceStream_, numTempScaleValues); - lincsGpu_ = std::make_unique(ir.nLincsIter, ir.nProjOrder, deviceContext_, deviceStream_); - settleGpu_ = std::make_unique(mtop, deviceContext_, deviceStream_); - - coordinateScalingKernelLaunchConfig_.blockSize[0] = c_threadsPerBlock; - coordinateScalingKernelLaunchConfig_.blockSize[1] = 1; - coordinateScalingKernelLaunchConfig_.blockSize[2] = 1; - coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0; -} - -UpdateConstrainGpu::Impl::~Impl() {} - -void UpdateConstrainGpu::Impl::set(DeviceBuffer d_x, - DeviceBuffer d_v, - const DeviceBuffer d_f, - const InteractionDefinitions& idef, - const t_mdatoms& md) -{ - // TODO wallcycle - wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); - wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); - - GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null."); - GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null."); - GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null."); - - d_x_ = d_x; - d_v_ = d_v; - d_f_ = d_f; - - numAtoms_ = md.nr; - - reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, deviceContext_); - - reallocateDeviceBuffer( - &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_); - - // Integrator should also update something, but it does not even have a method yet - integrator_->set(numAtoms_, md.invmass, md.cTC); - lincsGpu_->set(idef, numAtoms_, md.invmass); - settleGpu_->set(idef); - - coordinateScalingKernelLaunchConfig_.gridSize[0] = - (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock; - - wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain); - wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); -} - -void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box) -{ - // TODO wallcycle - setPbcAiuc(numPbcDimensions(pbcType), box, &pbcAiuc_); -} - -GpuEventSynchronizer* UpdateConstrainGpu::Impl::getCoordinatesReadySync() -{ - return coordinatesReady_; -} - -UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& ir, - const gmx_mtop_t& mtop, - const int numTempScaleValues, - const DeviceContext& deviceContext, - const DeviceStream& deviceStream, - GpuEventSynchronizer* xUpdatedOnDevice, - gmx_wallcycle* wcycle) : - impl_(new Impl(ir, mtop, numTempScaleValues, deviceContext, deviceStream, xUpdatedOnDevice, wcycle)) -{ -} - -UpdateConstrainGpu::~UpdateConstrainGpu() = default; - -void UpdateConstrainGpu::integrate(GpuEventSynchronizer* fReadyOnDevice, - const real dt, - const bool updateVelocities, - const bool computeVirial, - tensor virialScaled, - const bool doTemperatureScaling, - gmx::ArrayRef tcstat, - const bool doParrinelloRahman, - const float dtPressureCouple, - const matrix prVelocityScalingMatrix) -{ - impl_->integrate(fReadyOnDevice, - dt, - updateVelocities, - computeVirial, - virialScaled, - doTemperatureScaling, - tcstat, - doParrinelloRahman, - dtPressureCouple, - prVelocityScalingMatrix); -} - -void UpdateConstrainGpu::scaleCoordinates(const matrix scalingMatrix) -{ - impl_->scaleCoordinates(scalingMatrix); -} - -void UpdateConstrainGpu::scaleVelocities(const matrix scalingMatrix) -{ - impl_->scaleVelocities(scalingMatrix); -} - -void UpdateConstrainGpu::set(DeviceBuffer d_x, - DeviceBuffer d_v, - const DeviceBuffer d_f, - const InteractionDefinitions& idef, - const t_mdatoms& md) -{ - impl_->set(d_x, d_v, d_f, idef, md); -} - -void UpdateConstrainGpu::setPbc(const PbcType pbcType, const matrix box) -{ - impl_->setPbc(pbcType, box); -} - -GpuEventSynchronizer* UpdateConstrainGpu::getCoordinatesReadySync() -{ - return impl_->getCoordinatesReadySync(); -} - -bool UpdateConstrainGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop) -{ - return LincsGpu::isNumCoupledConstraintsSupported(mtop); -} - -} // namespace gmx diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index 9b260d1a99..150c944aae 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h @@ -228,19 +228,6 @@ private: gmx_wallcycle* wcycle_ = nullptr; }; -/*! \brief Scaling matrix struct. - * - * \todo Should be generalized. - */ -struct ScalingMatrix -{ - ScalingMatrix(const matrix m) : - xx(m[XX][XX]), yy(m[YY][YY]), zz(m[ZZ][ZZ]), yx(m[YY][XX]), zx(m[ZZ][XX]), zy(m[ZZ][YY]) - { - } - float xx, yy, zz, yx, zx, zy; -}; - } // namespace gmx #endif // GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl_stubs.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl_stubs.cpp new file mode 100644 index 0000000000..38b55c7931 --- /dev/null +++ b/src/gromacs/mdlib/update_constrain_gpu_impl_stubs.cpp @@ -0,0 +1,131 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Stub for update and constraints class CPU implementation. + * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "config.h" + +#include "gromacs/mdlib/update_constrain_gpu.h" +#include "gromacs/utility/gmxassert.h" + +#if !GMX_GPU_CUDA && !GMX_GPU_SYCL + +namespace gmx +{ + +class UpdateConstrainGpu::Impl +{ +}; + +UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir */, + const gmx_mtop_t& /* mtop */, + const int /* numTempScaleValues */, + const DeviceContext& /* deviceContext */, + const DeviceStream& /* deviceStream */, + GpuEventSynchronizer* /* xUpdatedOnDevice */, + gmx_wallcycle* /*wcycle*/) : + impl_(nullptr) +{ + GMX_ASSERT(!impl_, + "A CPU stub for UpdateConstrain was called instead of the correct implementation."); +} + +UpdateConstrainGpu::~UpdateConstrainGpu() = default; + +void UpdateConstrainGpu::integrate(GpuEventSynchronizer* /* fReadyOnDevice */, + const real /* dt */, + const bool /* updateVelocities */, + const bool /* computeVirial */, + tensor /* virialScaled */, + const bool /* doTemperatureScaling */, + gmx::ArrayRef /* tcstat */, + const bool /* doParrinelloRahman */, + const float /* dtPressureCouple */, + const matrix /* prVelocityScalingMatrix*/) +{ + GMX_ASSERT(!impl_, + "A CPU stub for UpdateConstrain was called instead of the correct implementation."); +} + +void UpdateConstrainGpu::scaleCoordinates(const matrix /* scalingMatrix */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for UpdateConstrain was called instead of the correct implementation."); +} + +void UpdateConstrainGpu::scaleVelocities(const matrix /* scalingMatrix */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for UpdateConstrain was called instead of the correct implementation."); +} + +void UpdateConstrainGpu::set(DeviceBuffer /* d_x */, + DeviceBuffer /* d_v */, + const DeviceBuffer /* d_f */, + const InteractionDefinitions& /* idef */, + const t_mdatoms& /* md */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for UpdateConstrain was called instead of the correct implementation."); +} + +void UpdateConstrainGpu::setPbc(const PbcType /* pbcType */, const matrix /* box */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for UpdateConstrain was called instead of the correct implementation."); +} + +GpuEventSynchronizer* UpdateConstrainGpu::getCoordinatesReadySync() +{ + GMX_ASSERT(!impl_, + "A CPU stub for UpdateConstrain was called instead of the correct implementation."); + return nullptr; +} + +bool UpdateConstrainGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& /* mtop */) +{ + return false; +} + +} // namespace gmx + +#endif /* !GMX_GPU_CUDA && !GMX_GPU_SYCL */ diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal.cu b/src/gromacs/mdlib/update_constrain_gpu_internal.cu new file mode 100644 index 0000000000..5787bc476e --- /dev/null +++ b/src/gromacs/mdlib/update_constrain_gpu_internal.cu @@ -0,0 +1,105 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements backend-specific functions of the update-constraints in CUDA. + * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#include "update_constrain_gpu_internal.h" + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/typecasts.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" + +namespace gmx +{ + +/*!\brief Number of CUDA threads in a block + * + * \todo Check if using smaller block size will lead to better performance. + */ +constexpr static int c_threadsPerBlock = 256; +//! Maximum number of threads in a block (for __launch_bounds__) +constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock; + +__launch_bounds__(c_maxThreadsPerBlock) __global__ + static void scaleCoordinates_kernel(const int numAtoms, + float3* __restrict__ gm_x, + const ScalingMatrix scalingMatrix) +{ + int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; + if (threadIndex < numAtoms) + { + float3 x = gm_x[threadIndex]; + + x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z; + x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z; + x.z = scalingMatrix.zz * x.z; + + gm_x[threadIndex] = x; + } +} + +void launchScaleCoordinatesKernel(const int numAtoms, + DeviceBuffer d_coordinates, + const ScalingMatrix mu, + const DeviceStream& deviceStream) +{ + KernelLaunchConfig kernelLaunchConfig; + + kernelLaunchConfig.blockSize[0] = c_threadsPerBlock; + kernelLaunchConfig.blockSize[1] = 1; + kernelLaunchConfig.blockSize[2] = 1; + kernelLaunchConfig.sharedMemorySize = 0; + + kernelLaunchConfig.gridSize[0] = (numAtoms + c_threadsPerBlock - 1) / c_threadsPerBlock; + + const auto kernelArgs = prepareGpuKernelArguments( + scaleCoordinates_kernel, kernelLaunchConfig, &numAtoms, asFloat3Pointer(&d_coordinates), &mu); + launchGpuKernel(scaleCoordinates_kernel, + kernelLaunchConfig, + deviceStream, + nullptr, + "scaleCoordinates_kernel", + kernelArgs); + // TODO: Although this only happens on the pressure coupling steps, this synchronization + // can affect the performance if nstpcouple is small. See Issue #4018 + deviceStream.synchronize(); +} + +} // namespace gmx diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal.h b/src/gromacs/mdlib/update_constrain_gpu_internal.h new file mode 100644 index 0000000000..804636f8f0 --- /dev/null +++ b/src/gromacs/mdlib/update_constrain_gpu_internal.h @@ -0,0 +1,84 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declares GPU implementations of backend-specific update-constraints functions. + * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#ifndef GMX_MDLIB_UPDATE_CONSTRAIN_GPU_INTERNAL_H +#define GMX_MDLIB_UPDATE_CONSTRAIN_GPU_INTERNAL_H + +#include "gmxpre.h" + +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/gpu_utils/gputraits.h" + +class GpuEventSynchronizer; + +namespace gmx +{ + +/*! \brief Scaling matrix struct. + * + * \todo Should be generalized. + */ +struct ScalingMatrix +{ + ScalingMatrix(const matrix m) : + xx(m[XX][XX]), yy(m[YY][YY]), zz(m[ZZ][ZZ]), yx(m[YY][XX]), zx(m[ZZ][XX]), zy(m[ZZ][YY]) + { + } + float xx, yy, zz, yx, zx, zy; +}; + +/*! \brief Launches positions of velocities scaling kernel. + * + * \param[in] numAtoms Number of atoms in the system. + * \param[in] d_coordinates Device buffer with position or velocities to be scaled. + * \param[in] mu Scaling matrix. + * \param[in] deviceStream Stream to launch kernel in. + */ +void launchScaleCoordinatesKernel(int numAtoms, + DeviceBuffer d_coordinates, + const ScalingMatrix mu, + const DeviceStream& deviceStream); + +} // namespace gmx + +#endif // GMX_MDLIB_UPDATE_CONSTRAIN_GPU_INTERNAL_H diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp new file mode 100644 index 0000000000..72723bd99e --- /dev/null +++ b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp @@ -0,0 +1,61 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements backend-specific functions of the update-constraints in SYCL. + * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#include "gmxpre.h" + +#include "update_constrain_gpu_internal.h" + +#include "gromacs/utility/gmxassert.h" + +namespace gmx +{ + +void launchScaleCoordinatesKernel(const int /* numAtoms */, + DeviceBuffer /* d_coordinates */, + const ScalingMatrix /* mu */, + const DeviceStream& /* deviceStream */) +{ + // SYCL_TODO + GMX_RELEASE_ASSERT(false, "Coordinates scaling is not yet implemented in SYCL."); +} + +} // namespace gmx -- 2.22.0