From 47370908b87106c4ef7ea8b99df3f86998c3e0fc Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Wed, 19 May 2021 10:53:55 +0000
Subject: [PATCH] Make common Leap-Frog device code backend-agnostic

---
 src/gromacs/mdlib/CMakeLists.txt              |  13 +-
 src/gromacs/mdlib/leapfrog_gpu.cpp            | 175 ++++++++++++++++++
 src/gromacs/mdlib/leapfrog_gpu.h              |   5 +-
 ...apfrog_gpu.cu => leapfrog_gpu_internal.cu} | 168 ++++-------------
 src/gromacs/mdlib/leapfrog_gpu_internal.h     |  71 +++++++
 ...ycl.cpp => leapfrog_gpu_internal_sycl.cpp} | 124 +++---------
 6 files changed, 316 insertions(+), 240 deletions(-)
 create mode 100644 src/gromacs/mdlib/leapfrog_gpu.cpp
 rename src/gromacs/mdlib/{leapfrog_gpu.cu => leapfrog_gpu_internal.cu} (62%)
 create mode 100644 src/gromacs/mdlib/leapfrog_gpu_internal.h
 rename src/gromacs/mdlib/{leapfrog_gpu_sycl.cpp => leapfrog_gpu_internal_sycl.cpp} (65%)

diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt
index 2e645f0362..543f879212 100644
--- a/src/gromacs/mdlib/CMakeLists.txt
+++ b/src/gromacs/mdlib/CMakeLists.txt
@@ -40,7 +40,8 @@ file(GLOB MDLIB_SOURCES *.cpp)
 list(REMOVE_ITEM MDLIB_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl_internal_sycl.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_internal_sycl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu.cpp
@@ -51,9 +52,10 @@ list(REMOVE_ITEM MDLIB_SOURCES
 set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
 if(GMX_GPU_CUDA)
     gmx_add_libgromacs_sources(
-       leapfrog_gpu.cu
        gpuforcereduction_impl.cpp
        gpuforcereduction_impl_internal.cu
+       leapfrog_gpu.cpp
+       leapfrog_gpu_internal.cu
        lincs_gpu.cpp
        lincs_gpu_internal.cu
        settle_gpu.cpp
@@ -63,6 +65,7 @@ if(GMX_GPU_CUDA)
     )
     _gmx_add_files_to_property(CUDA_SOURCES
        gpuforcereduction_impl.cpp
+       leapfrog_gpu.cpp
        lincs_gpu.cpp
        settle_gpu.cpp
        update_constrain_gpu_impl.cpp
@@ -73,7 +76,8 @@ if(GMX_GPU_SYCL)
     gmx_add_libgromacs_sources(
        gpuforcereduction_impl.cpp
        gpuforcereduction_impl_internal_sycl.cpp
-       leapfrog_gpu_sycl.cpp
+       leapfrog_gpu.cpp
+       leapfrog_gpu_internal_sycl.cpp
        lincs_gpu.cpp
        lincs_gpu_internal_sycl.cpp
        settle_gpu.cpp
@@ -85,7 +89,8 @@ if(GMX_GPU_SYCL)
     _gmx_add_files_to_property(SYCL_SOURCES
        gpuforcereduction_impl.cpp
        gpuforcereduction_impl_internal_sycl.cpp
-       leapfrog_gpu_sycl.cpp
+       leapfrog_gpu.cpp
+       leapfrog_gpu_internal_sycl.cpp
        lincs_gpu.cpp
        lincs_gpu_internal_sycl.cpp
        settle_gpu.cpp
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cpp b/src/gromacs/mdlib/leapfrog_gpu.cpp
new file mode 100644
index 0000000000..b07c167c7c
--- /dev/null
+++ b/src/gromacs/mdlib/leapfrog_gpu.cpp
@@ -0,0 +1,175 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the backend-agnostic part of the Leap-Frog GPU integrator
+ *
+ * This file contains backend-agnostic code for the Leap-Frog integrator class on GPU,
+ * including class initialization and data-structure management.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "leapfrog_gpu.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <cmath>
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/leapfrog_gpu_internal.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/utility/arrayref.h"
+
+namespace gmx
+{
+
+void LeapFrogGpu::integrate(DeviceBuffer<Float3>              d_x,
+                            DeviceBuffer<Float3>              d_xp,
+                            DeviceBuffer<Float3>              d_v,
+                            const DeviceBuffer<Float3>        d_f,
+                            const float                       dt,
+                            const bool                        doTemperatureScaling,
+                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
+                            const bool                        doParrinelloRahman,
+                            const float                       dtPressureCouple,
+                            const matrix                      prVelocityScalingMatrix)
+{
+
+    if (doTemperatureScaling)
+    {
+        GMX_ASSERT(checkDeviceBuffer(d_lambdas_, numTempScaleValues_),
+                   "Number of temperature scaling factors changed since it was set for the "
+                   "last time.");
+        GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
+                   "Number of temperature scaling factors changed since it was set for the "
+                   "last time.");
+
+        /* In SYCL, we could use host accessors here, without h_lambdas_.
+         * According to a quick test, host accessor is slightly faster when using DPC++ and
+         * LevelZero compared to using h_lambdas_ + cgh.copy. But with DPC++ and OpenCL, the host
+         * accessor waits for fReadyOnDevice in UpdateConstrainGpu::Impl::integrate. See #4023. */
+        for (int i = 0; i < numTempScaleValues_; i++)
+        {
+            h_lambdas_[i] = tcstat[i].lambda;
+        }
+        copyToDeviceBuffer(&d_lambdas_,
+                           h_lambdas_.data(),
+                           0,
+                           numTempScaleValues_,
+                           deviceStream_,
+                           GpuApiCallBehavior::Async,
+                           nullptr);
+    }
+    VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
+    if (doParrinelloRahman)
+    {
+        prVelocityScalingType = VelocityScalingType::Diagonal;
+        GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
+                           && prVelocityScalingMatrix[ZZ][YY] == 0 && prVelocityScalingMatrix[XX][YY] == 0
+                           && prVelocityScalingMatrix[XX][ZZ] == 0 && prVelocityScalingMatrix[YY][ZZ] == 0,
+                   "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
+                   "in GPU version of Leap-Frog integrator.");
+        prVelocityScalingMatrixDiagonal_ = Float3{ dtPressureCouple * prVelocityScalingMatrix[XX][XX],
+                                                   dtPressureCouple * prVelocityScalingMatrix[YY][YY],
+                                                   dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ] };
+    }
+
+    launchLeapFrogKernel(numAtoms_,
+                         d_x,
+                         d_xp,
+                         d_v,
+                         d_f,
+                         d_inverseMasses_,
+                         dt,
+                         doTemperatureScaling,
+                         numTempScaleValues_,
+                         d_tempScaleGroups_,
+                         d_lambdas_,
+                         prVelocityScalingType,
+                         prVelocityScalingMatrixDiagonal_,
+                         deviceStream_);
+}
+
+LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
+                         const DeviceStream&  deviceStream,
+                         const int            numTempScaleValues) :
+    deviceContext_(deviceContext), deviceStream_(deviceStream), numTempScaleValues_(numTempScaleValues)
+{
+    numAtoms_ = 0;
+
+    changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);
+
+    // If the temperature coupling is enabled, we need to make space for scaling factors
+    if (numTempScaleValues_ > 0)
+    {
+        h_lambdas_.resize(numTempScaleValues_);
+        reallocateDeviceBuffer(
+                &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
+    }
+}
+
+LeapFrogGpu::~LeapFrogGpu()
+{
+    freeDeviceBuffer(&d_inverseMasses_);
+}
+
+void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
+{
+    numAtoms_ = numAtoms;
+
+    reallocateDeviceBuffer(
+            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
+    copyToDeviceBuffer(
+            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
+
+    // The temperature scale group map is only used if there is more than one group
+    if (numTempScaleValues_ > 1)
+    {
+        reallocateDeviceBuffer(
+                &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
+        copyToDeviceBuffer(
+                &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
+    }
+}
+
+} // namespace gmx
diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h
index 9102e2a27e..0f48770178 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.h
+++ b/src/gromacs/mdlib/leapfrog_gpu.h
@@ -125,7 +125,7 @@ public:
                    DeviceBuffer<Float3>       d_xp,
                    DeviceBuffer<Float3>       d_v,
                    const DeviceBuffer<Float3> d_f,
-                   const real                 dt,
+                   const float                dt,
                    const bool                 doTemperatureScaling,
                    gmx::ArrayRef<const t_grp_tcstat> tcstat,
                    const bool                 doParrinelloRahman,
@@ -152,8 +152,7 @@ private:
     const DeviceContext& deviceContext_;
     //! GPU stream
     const DeviceStream& deviceStream_;
-    //! GPU kernel launch config
-    KernelLaunchConfig kernelLaunchConfig_;
+
     //! Number of atoms
     int numAtoms_;
 
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu_internal.cu
similarity index 62%
rename from src/gromacs/mdlib/leapfrog_gpu.cu
rename to src/gromacs/mdlib/leapfrog_gpu_internal.cu
index 0ce1174b33..e3f9bc7b82 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.cu
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal.cu
@@ -36,9 +36,7 @@
  *
  * \brief Implements Leap-Frog using CUDA
  *
- * This file contains implementation of basic Leap-Frog integrator
- * using CUDA, including class initialization, data-structures management
- * and GPU kernel.
+ * This file contains the CUDA implementation of the backend-specific code for Leap-Frog.
  *
  * \author Artem Zhmurov <zhmurov@gmail.com>
  *
@@ -46,14 +44,7 @@
  */
 #include "gmxpre.h"
 
-#include "leapfrog_gpu.h"
-
-#include <assert.h>
-#include <stdio.h>
-
-#include <cmath>
-
-#include <algorithm>
+#include "leapfrog_gpu_internal.h"
 
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
@@ -101,19 +92,6 @@ constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  * \param[in] prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix
  */
 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
-__launch_bounds__(c_maxThreadsPerBlock) __global__
-        void leapfrog_kernel(const int numAtoms,
-                             float3* __restrict__ gm_x,
-                             float3* __restrict__ gm_xp,
-                             float3* __restrict__ gm_v,
-                             const float3* __restrict__ gm_f,
-                             const float* __restrict__ gm_inverseMasses,
-                             const float dt,
-                             const float* __restrict__ gm_lambdas,
-                             const unsigned short* __restrict__ gm_tempScaleGroups,
-                             const float3 prVelocityScalingMatrixDiagonal);
-
-template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 __launch_bounds__(c_maxThreadsPerBlock) __global__
         void leapfrog_kernel(const int numAtoms,
@@ -176,7 +154,6 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         gm_v[threadIndex] = v;
         gm_x[threadIndex] = x;
     }
-    return;
 }
 
 /*! \brief Select templated kernel.
@@ -238,124 +215,49 @@ inline auto selectLeapFrogKernelPtr(bool doTemperatureScaling,
                                                    prVelocityScalingType);
     return kernelPtr;
 }
 
-void LeapFrogGpu::integrate(DeviceBuffer<Float3>       d_x,
-                            DeviceBuffer<Float3>       d_xp,
-                            DeviceBuffer<Float3>       d_v,
-                            const DeviceBuffer<Float3> d_f,
-                            const real                 dt,
-                            const bool                 doTemperatureScaling,
-                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                            const bool                 doParrinelloRahman,
-                            const float                dtPressureCouple,
-                            const matrix               prVelocityScalingMatrix)
+
+void launchLeapFrogKernel(const int                          numAtoms,
+                          DeviceBuffer<Float3>               d_x,
+                          DeviceBuffer<Float3>               d_xp,
+                          DeviceBuffer<Float3>               d_v,
+                          const DeviceBuffer<Float3>         d_f,
+                          const DeviceBuffer<float>          d_inverseMasses,
+                          const float                        dt,
+                          const bool                         doTemperatureScaling,
+                          const int                          numTempScaleValues,
+                          const DeviceBuffer<unsigned short> d_tempScaleGroups,
+                          const DeviceBuffer<float>          d_lambdas,
+                          const VelocityScalingType          prVelocityScalingType,
+                          const Float3                       prVelocityScalingMatrixDiagonal,
+                          const DeviceStream&                deviceStream)
 {
+    // Checking the buffer types against the kernel argument types
+    static_assert(sizeof(*d_inverseMasses) == sizeof(float), "Incompatible types");
 
-    ensureNoPendingDeviceError("In CUDA version of Leap-Frog integrator");
+    KernelLaunchConfig kernelLaunchConfig;
 
-    auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
-    if (doTemperatureScaling || doParrinelloRahman)
-    {
-        if (doTemperatureScaling)
-        {
-            GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
-                       "Number of temperature scaling factors changed since it was set for the "
-                       "last time.");
-            for (int i = 0; i < numTempScaleValues_; i++)
-            {
-                h_lambdas_[i] = tcstat[i].lambda;
-            }
-            copyToDeviceBuffer(&d_lambdas_,
-                               h_lambdas_.data(),
-                               0,
-                               numTempScaleValues_,
-                               deviceStream_,
-                               GpuApiCallBehavior::Async,
-                               nullptr);
-        }
-        VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
-        if (doParrinelloRahman)
-        {
-            prVelocityScalingType = VelocityScalingType::Diagonal;
-            GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
-                               && prVelocityScalingMatrix[ZZ][YY] == 0
-                               && prVelocityScalingMatrix[XX][YY] == 0
-                               && prVelocityScalingMatrix[XX][ZZ] == 0
-                               && prVelocityScalingMatrix[YY][ZZ] == 0,
-                       "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
-                       "in GPU version of Leap-Frog integrator.");
-            prVelocityScalingMatrixDiagonal_ =
-                    Float3{ dtPressureCouple * prVelocityScalingMatrix[XX][XX],
-                            dtPressureCouple * prVelocityScalingMatrix[YY][YY],
-                            dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ] };
-        }
-        kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
-    }
+    kernelLaunchConfig.gridSize[0]      = (numAtoms + c_threadsPerBlock - 1) / c_threadsPerBlock;
+    kernelLaunchConfig.blockSize[0]     = c_threadsPerBlock;
+    kernelLaunchConfig.blockSize[1]     = 1;
+    kernelLaunchConfig.blockSize[2]     = 1;
+    kernelLaunchConfig.sharedMemorySize = 0;
+
+    auto kernelPtr =
+            selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues, prVelocityScalingType);
 
-    // Checking the buffer types against the kernel argument types
-    static_assert(sizeof(*d_inverseMasses_) == sizeof(float), "Incompatible types");
     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
-                                                      kernelLaunchConfig_,
-                                                      &numAtoms_,
+                                                      kernelLaunchConfig,
+                                                      &numAtoms,
                                                       asFloat3Pointer(&d_x),
                                                       asFloat3Pointer(&d_xp),
                                                       asFloat3Pointer(&d_v),
                                                       asFloat3Pointer(&d_f),
-                                                      &d_inverseMasses_,
+                                                      &d_inverseMasses,
                                                       &dt,
-                                                      &d_lambdas_,
-                                                      &d_tempScaleGroups_,
-                                                      &prVelocityScalingMatrixDiagonal_);
-    launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr, "leapfrog_kernel", kernelArgs);
-
-    return;
-}
-
-LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
-                         const DeviceStream&  deviceStream,
-                         const int            numTempScaleValues) :
-    deviceContext_(deviceContext), deviceStream_(deviceStream), numTempScaleValues_(numTempScaleValues)
-{
-    numAtoms_ = 0;
-
-    changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);
-
-    kernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
-    kernelLaunchConfig_.blockSize[1]     = 1;
-    kernelLaunchConfig_.blockSize[2]     = 1;
-    kernelLaunchConfig_.sharedMemorySize = 0;
-
-    // If the temperature coupling is enabled, we need to make space for scaling factors
-    if (numTempScaleValues_ > 0)
-    {
-        h_lambdas_.resize(numTempScaleValues_);
-        reallocateDeviceBuffer(
-                &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
-    }
-}
-
-LeapFrogGpu::~LeapFrogGpu()
-{
-    freeDeviceBuffer(&d_inverseMasses_);
-}
-
-void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
-{
-    numAtoms_ = numAtoms;
-    kernelLaunchConfig_.gridSize[0] = (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
-
-    reallocateDeviceBuffer(
-            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
-    copyToDeviceBuffer(
-            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-
-    // Temperature scale group map only used if there are more then one group
-    if (numTempScaleValues_ > 1)
-    {
-        reallocateDeviceBuffer(
-                &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
-        copyToDeviceBuffer(
-                &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-    }
+                                                      &d_lambdas,
+                                                      &d_tempScaleGroups,
+                                                      &prVelocityScalingMatrixDiagonal);
+    launchGpuKernel(kernelPtr, kernelLaunchConfig, deviceStream, nullptr, "leapfrog_kernel", kernelArgs);
 }
 
 } // namespace gmx
diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal.h b/src/gromacs/mdlib/leapfrog_gpu_internal.h
new file mode 100644
index 0000000000..76330fe180
--- /dev/null
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal.h
@@ -0,0 +1,71 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief Declarations of backend-specific GPU functions for Leap-Frog.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ * \inlibraryapi
+ */
+#ifndef GMX_MDLIB_LEAPFROG_GPU_INTERNAL_H
+#define GMX_MDLIB_LEAPFROG_GPU_INTERNAL_H
+
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/mdlib/leapfrog_gpu.h"
+
+namespace gmx
+{
+
+void launchLeapFrogKernel(int                          numAtoms,
+                          DeviceBuffer<Float3>         d_x,
+                          DeviceBuffer<Float3>         d_xp,
+                          DeviceBuffer<Float3>         d_v,
+                          DeviceBuffer<Float3>         d_f,
+                          DeviceBuffer<float>          d_inverseMasses,
+                          float                        dt,
+                          bool                         doTemperatureScaling,
+                          int                          numTempScaleValues,
+                          DeviceBuffer<unsigned short> d_tempScaleGroups,
+                          DeviceBuffer<float>          d_lambdas,
+                          VelocityScalingType          prVelocityScalingType,
+                          Float3                       prVelocityScalingMatrixDiagonal,
+                          const DeviceStream&          deviceStream);
+
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
similarity index 65%
rename from src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
rename to src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
index 89997a69ca..566c92b1b4 100644
--- a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
@@ -36,9 +36,7 @@
  *
  * \brief Implements Leap-Frog using SYCL
  *
- * This file contains implementation of basic Leap-Frog integrator
- * using SYCL, including class initialization, data-structures management
- * and GPU kernel.
+ * This file contains the SYCL implementation of the backend-specific code for Leap-Frog.
  *
  * \author Artem Zhmurov <zhmurov@gmail.com>
 * \author Andrey Alekseenko <al42and@gmail.com>
 *
@@ -47,6 +45,8 @@
  */
 #include "gmxpre.h"
 
+#include "leapfrog_gpu_internal.h"
+
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gmxsycl.h"
 #include "gromacs/math/vec.h"
@@ -216,114 +216,38 @@ static inline cl::sycl::event launchLeapFrogKernel(NumTempScaleValues tempScali
                                                    prVelocityScalingType);
 }
 
-void LeapFrogGpu::integrate(DeviceBuffer<Float3> d_x,
-                            DeviceBuffer<Float3> d_xp,
-                            DeviceBuffer<Float3> d_v,
-                            DeviceBuffer<Float3> d_f,
-                            const real           dt,
-                            const bool           doTemperatureScaling,
-                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                            const bool           doParrinelloRahman,
-                            const float          dtPressureCouple,
-                            const matrix         prVelocityScalingMatrix)
+void launchLeapFrogKernel(int                                numAtoms,
+                          DeviceBuffer<Float3>               d_x,
+                          DeviceBuffer<Float3>               d_xp,
+                          DeviceBuffer<Float3>               d_v,
+                          const DeviceBuffer<Float3>         d_f,
+                          const DeviceBuffer<float>          d_inverseMasses,
+                          const float                        dt,
+                          const bool                         doTemperatureScaling,
+                          const int                          numTempScaleValues,
+                          const DeviceBuffer<unsigned short> d_tempScaleGroups,
+                          const DeviceBuffer<float>          d_lambdas,
+                          const VelocityScalingType          prVelocityScalingType,
+                          const Float3                       prVelocityScalingMatrixDiagonal,
+                          const DeviceStream&                deviceStream)
 {
-    if (doTemperatureScaling)
-    {
-        GMX_ASSERT(checkDeviceBuffer(d_lambdas_, numTempScaleValues_),
-                   "Number of temperature scaling factors changed since it was set for the "
-                   "last time.");
-        GMX_RELEASE_ASSERT(gmx::ssize(h_lambdas_) == numTempScaleValues_,
-                           "Number of temperature scaling factors changed since it was set for the "
-                           "last time.");
-        /* We could use host accessors here, without h_lambdas_.
-         * According to a quick test, host accessor is slightly faster when using DPC++ and
-         * LevelZero compared to using h_lambdas_ + cgh.copy. But with DPC++ and OpenCL, the host
-         * accessor waits for fReadyOnDevice in UpdateConstrainGpu::Impl::integrate. See #4023. */
-
-        for (int i = 0; i < numTempScaleValues_; i++)
-        {
-            h_lambdas_[i] = tcstat[i].lambda;
-        }
-        copyToDeviceBuffer(&d_lambdas_,
-                           h_lambdas_.data(),
-                           0,
-                           numTempScaleValues_,
-                           deviceStream_,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
     NumTempScaleValues tempVelocityScalingType =
-            getTempScalingType(doTemperatureScaling, numTempScaleValues_);
+            getTempScalingType(doTemperatureScaling, numTempScaleValues);
 
-    VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
-    if (doParrinelloRahman)
-    {
-        prVelocityScalingType = VelocityScalingType::Diagonal;
-        GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
-                           && prVelocityScalingMatrix[ZZ][YY] == 0 && prVelocityScalingMatrix[XX][YY] == 0
-                           && prVelocityScalingMatrix[XX][ZZ] == 0 && prVelocityScalingMatrix[YY][ZZ] == 0,
-                   "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
-                   "in GPU version of Leap-Frog integrator.");
-        prVelocityScalingMatrixDiagonal_ = dtPressureCouple
-                                           * Float3{ prVelocityScalingMatrix[XX][XX],
-                                                     prVelocityScalingMatrix[YY][YY],
-                                                     prVelocityScalingMatrix[ZZ][ZZ] };
-    }
     launchLeapFrogKernel(tempVelocityScalingType,
                          prVelocityScalingType,
-                         deviceStream_,
-                         numAtoms_,
+                         deviceStream,
+                         numAtoms,
                          d_x,
                          d_xp,
                          d_v,
                          d_f,
-                         d_inverseMasses_,
+                         d_inverseMasses,
                          dt,
-                         d_lambdas_,
-                         d_tempScaleGroups_,
-                         prVelocityScalingMatrixDiagonal_);
-}
-
-LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
-                         const DeviceStream&  deviceStream,
-                         const int            numTempScaleValues) :
-    deviceContext_(deviceContext),
-    deviceStream_(deviceStream),
-    numAtoms_(0),
-    numTempScaleValues_(numTempScaleValues)
-{
-    // If the temperature coupling is enabled, we need to make space for scaling factors
-    if (numTempScaleValues_ > 0)
-    {
-        h_lambdas_.resize(numTempScaleValues_);
-        reallocateDeviceBuffer(
-                &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
-    }
-}
-
-LeapFrogGpu::~LeapFrogGpu()
-{
-    freeDeviceBuffer(&d_inverseMasses_);
-}
-
-void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
-{
-    numAtoms_ = numAtoms;
-
-    reallocateDeviceBuffer(
-            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
-    copyToDeviceBuffer(
-            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-
-    // Temperature scale group map only used if there are more then one group
-    if (numTempScaleValues_ > 1)
-    {
-        reallocateDeviceBuffer(
-                &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
-        copyToDeviceBuffer(
-                &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-    }
+                         d_lambdas,
+                         d_tempScaleGroups,
+                         prVelocityScalingMatrixDiagonal);
 }
 
 } // namespace gmx
-- 
2.22.0
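
For reference, a minimal sketch of how a caller drives the class after this refactoring: the backend-agnostic LeapFrogGpu (leapfrog_gpu.cpp) owns all buffer management and delegates the actual kernel launch to launchLeapFrogKernel(), which is resolved at build time to either the CUDA (leapfrog_gpu_internal.cu) or the SYCL (leapfrog_gpu_internal_sycl.cpp) translation unit. The caller-side variable names below are hypothetical stand-ins, not part of the patch:

    // Hypothetical caller (sketch only; in GROMACS this logic lives in the
    // update/constraints machinery). All d_* buffers, the device context and
    // the stream are assumed to be set up by the caller.
    gmx::LeapFrogGpu leapFrog(deviceContext, deviceStream, numTempScaleValues);

    // Upload per-atom data: (re)allocates the inverse-mass buffer and, when
    // more than one temperature-coupling group exists, the group map.
    leapFrog.set(numAtoms, inverseMasses, tempScaleGroups);

    // One MD step: copies fresh scaling factors if T-coupling is active,
    // reduces the Parrinello-Rahman matrix to its diagonal, then calls the
    // backend's launchLeapFrogKernel().
    leapFrog.integrate(d_x, d_xp, d_v, d_f, dt, doTemperatureScaling, tcstat,
                       doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);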