Make common Leap-Frog device code backend-agnostic

author Artem Zhmurov <zhmurov@gmail.com>

Wed, 19 May 2021 10:53:55 +0000 (10:53 +0000)

committer Artem Zhmurov <zhmurov@gmail.com>

Wed, 19 May 2021 10:53:55 +0000 (10:53 +0000)
author Artem Zhmurov <zhmurov@gmail.com>
Wed, 19 May 2021 10:53:55 +0000 (10:53 +0000)
committer Artem Zhmurov <zhmurov@gmail.com>
Wed, 19 May 2021 10:53:55 +0000 (10:53 +0000)
diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt

index 2e645f0362fcef091e78eebe2e9f44245e743c07..543f879212adc7cecb99669c9dba29284e4ad68f 100644 (file)
--- a/src/gromacs/mdlib/CMakeLists.txt
+++ b/src/gromacs/mdlib/CMakeLists.txt
@@ -40,7 +40,8 @@ file(GLOB MDLIB_SOURCES *.cpp)
  list(REMOVE_ITEM MDLIB_SOURCES
      ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl_internal_sycl.cpp
  list(REMOVE_ITEM MDLIB_SOURCES
      ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl_internal_sycl.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_internal_sycl.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp
      ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu.cpp
@@ -51,9 +52,10 @@ list(REMOVE_ITEM MDLIB_SOURCES
  set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
  if(GMX_GPU_CUDA)
      gmx_add_libgromacs_sources(
  set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
  if(GMX_GPU_CUDA)
      gmx_add_libgromacs_sources(
-       leapfrog_gpu.cu
         gpuforcereduction_impl.cpp
         gpuforcereduction_impl_internal.cu
         gpuforcereduction_impl.cpp
         gpuforcereduction_impl_internal.cu
+       leapfrog_gpu.cpp
+       leapfrog_gpu_internal.cu
         lincs_gpu.cpp
         lincs_gpu_internal.cu
         settle_gpu.cpp
         lincs_gpu.cpp
         lincs_gpu_internal.cu
         settle_gpu.cpp
@@ -63,6 +65,7 @@ if(GMX_GPU_CUDA)
         )
      _gmx_add_files_to_property(CUDA_SOURCES
         gpuforcereduction_impl.cpp
         )
      _gmx_add_files_to_property(CUDA_SOURCES
         gpuforcereduction_impl.cpp
+       leapfrog_gpu.cpp
         lincs_gpu.cpp
         settle_gpu.cpp
         update_constrain_gpu_impl.cpp
         lincs_gpu.cpp
         settle_gpu.cpp
         update_constrain_gpu_impl.cpp
@@ -73,7 +76,8 @@ if(GMX_GPU_SYCL)
      gmx_add_libgromacs_sources(
          gpuforcereduction_impl.cpp
          gpuforcereduction_impl_internal_sycl.cpp
      gmx_add_libgromacs_sources(
          gpuforcereduction_impl.cpp
          gpuforcereduction_impl_internal_sycl.cpp
-        leapfrog_gpu_sycl.cpp
+        leapfrog_gpu.cpp
+        leapfrog_gpu_internal_sycl.cpp
          lincs_gpu.cpp
          lincs_gpu_internal_sycl.cpp
          settle_gpu.cpp
          lincs_gpu.cpp
          lincs_gpu_internal_sycl.cpp
          settle_gpu.cpp
@@ -85,7 +89,8 @@ if(GMX_GPU_SYCL)
      _gmx_add_files_to_property(SYCL_SOURCES
          gpuforcereduction_impl.cpp
          gpuforcereduction_impl_internal_sycl.cpp
      _gmx_add_files_to_property(SYCL_SOURCES
          gpuforcereduction_impl.cpp
          gpuforcereduction_impl_internal_sycl.cpp
-        leapfrog_gpu_sycl.cpp
+        leapfrog_gpu.cpp
+        leapfrog_gpu_internal_sycl.cpp
          lincs_gpu.cpp
          lincs_gpu_internal_sycl.cpp
          settle_gpu.cpp
          lincs_gpu.cpp
          lincs_gpu_internal_sycl.cpp
          settle_gpu.cpp
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cpp b/src/gromacs/mdlib/leapfrog_gpu.cpp

new file mode 100644 (file)

index 0000000..b07c167
--- /dev/null
+++ b/src/gromacs/mdlib/leapfrog_gpu.cpp
@@ -0,0 +1,175 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements Leap-Frog using CUDA
+ *
+ * This file contains backend-agnostic code for Leap-Frog integrator class on GPU,
+ * including class initialization, and data-structures management.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "leapfrog_gpu.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <cmath>
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/leapfrog_gpu_internal.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/utility/arrayref.h"
+
+namespace gmx
+{
+
+void LeapFrogGpu::integrate(DeviceBuffer<Float3>              d_x,
+                            DeviceBuffer<Float3>              d_xp,
+                            DeviceBuffer<Float3>              d_v,
+                            const DeviceBuffer<Float3>        d_f,
+                            const float                       dt,
+                            const bool                        doTemperatureScaling,
+                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
+                            const bool                        doParrinelloRahman,
+                            const float                       dtPressureCouple,
+                            const matrix                      prVelocityScalingMatrix)
+{
+
+    if (doTemperatureScaling)
+    {
+        GMX_ASSERT(checkDeviceBuffer(d_lambdas_, numTempScaleValues_),
+                   "Number of temperature scaling factors changed since it was set for the "
+                   "last time.");
+        GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
+                   "Number of temperature scaling factors changed since it was set for the "
+                   "last time.");
+
+        /* In SYCL, we could use host accessors here, without h_lambdas_.
+         * According to a quick test, host accessor is slightly faster when using DPC++ and
+         * LevelZero compared to using h_lambdas_ + cgh.copy. But with DPC++ and OpenCL, the host
+         * accessor waits for fReadyOnDevice in UpdateConstrainGpu::Impl::integrate. See #4023. */
+        for (int i = 0; i < numTempScaleValues_; i++)
+        {
+            h_lambdas_[i] = tcstat[i].lambda;
+        }
+        copyToDeviceBuffer(&d_lambdas_,
+                           h_lambdas_.data(),
+                           0,
+                           numTempScaleValues_,
+                           deviceStream_,
+                           GpuApiCallBehavior::Async,
+                           nullptr);
+    }
+    VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
+    if (doParrinelloRahman)
+    {
+        prVelocityScalingType = VelocityScalingType::Diagonal;
+        GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
+                           && prVelocityScalingMatrix[ZZ][YY] == 0 && prVelocityScalingMatrix[XX][YY] == 0
+                           && prVelocityScalingMatrix[XX][ZZ] == 0 && prVelocityScalingMatrix[YY][ZZ] == 0,
+                   "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
+                   "in GPU version of Leap-Frog integrator.");
+        prVelocityScalingMatrixDiagonal_ = Float3{ dtPressureCouple * prVelocityScalingMatrix[XX][XX],
+                                                   dtPressureCouple * prVelocityScalingMatrix[YY][YY],
+                                                   dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ] };
+    }
+
+    launchLeapFrogKernel(numAtoms_,
+                         d_x,
+                         d_xp,
+                         d_v,
+                         d_f,
+                         d_inverseMasses_,
+                         dt,
+                         doTemperatureScaling,
+                         numTempScaleValues_,
+                         d_tempScaleGroups_,
+                         d_lambdas_,
+                         prVelocityScalingType,
+                         prVelocityScalingMatrixDiagonal_,
+                         deviceStream_);
+}
+
+LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
+                         const DeviceStream&  deviceStream,
+                         const int            numTempScaleValues) :
+    deviceContext_(deviceContext), deviceStream_(deviceStream), numTempScaleValues_(numTempScaleValues)
+{
+    numAtoms_ = 0;
+
+    changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);
+
+    // If the temperature coupling is enabled, we need to make space for scaling factors
+    if (numTempScaleValues_ > 0)
+    {
+        h_lambdas_.resize(numTempScaleValues_);
+        reallocateDeviceBuffer(
+                &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
+    }
+}
+
+LeapFrogGpu::~LeapFrogGpu()
+{
+    freeDeviceBuffer(&d_inverseMasses_);
+}
+
+void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
+{
+    numAtoms_ = numAtoms;
+
+    reallocateDeviceBuffer(
+            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
+    copyToDeviceBuffer(
+            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
+
+    // Temperature scale group map only used if there are more then one group
+    if (numTempScaleValues_ > 1)
+    {
+        reallocateDeviceBuffer(
+                &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
+        copyToDeviceBuffer(
+                &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
+    }
+}
+
+} // namespace gmx
diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h

index 9102e2a27e9a81ea5ec7c954761c2c149caa8dcd..0f487701784c3524fe69508e3185624eaac4c400 100644 (file)
--- a/src/gromacs/mdlib/leapfrog_gpu.h
+++ b/src/gromacs/mdlib/leapfrog_gpu.h
@@ -125,7 +125,7 @@ public:
                     DeviceBuffer<Float3>              d_xp,
                     DeviceBuffer<Float3>              d_v,
                     const DeviceBuffer<Float3>        d_f,
                     DeviceBuffer<Float3>              d_xp,
                     DeviceBuffer<Float3>              d_v,
                     const DeviceBuffer<Float3>        d_f,
-                   const real                        dt,
+                   const float                       dt,
                     const bool                        doTemperatureScaling,
                     gmx::ArrayRef<const t_grp_tcstat> tcstat,
                     const bool                        doParrinelloRahman,
                     const bool                        doTemperatureScaling,
                     gmx::ArrayRef<const t_grp_tcstat> tcstat,
                     const bool                        doParrinelloRahman,
@@ -152,8 +152,7 @@ private:
      const DeviceContext& deviceContext_;
      //! GPU stream
      const DeviceStream& deviceStream_;
      const DeviceContext& deviceContext_;
      //! GPU stream
      const DeviceStream& deviceStream_;
-    //! GPU kernel launch config
-    KernelLaunchConfig kernelLaunchConfig_;
+
      //! Number of atoms
      int numAtoms_;
  
      //! Number of atoms
      int numAtoms_;
  
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu_internal.cu

similarity index 62%

rename from src/gromacs/mdlib/leapfrog_gpu.cu

rename to src/gromacs/mdlib/leapfrog_gpu_internal.cu

index 0ce1174b336e59d4153c6f923aebbd9ec83231c2..e3f9bc7b82cd1d916cb642fb86914b86db29babd 100644 (file)
--- a/src/gromacs/mdlib/leapfrog_gpu.cu
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal.cu
@@ -36,9 +36,7 @@
   *
   * \brief Implements Leap-Frog using CUDA
   *
   *
   * \brief Implements Leap-Frog using CUDA
   *
- * This file contains implementation of basic Leap-Frog integrator
- * using CUDA, including class initialization, data-structures management
- * and GPU kernel.
+ * This file contains CUDA implementation of back-end specific code for Leap-Frog.
   *
   * \author Artem Zhmurov <zhmurov@gmail.com>
   *
   *
   * \author Artem Zhmurov <zhmurov@gmail.com>
   *
@@ -46,14 +44,7 @@
   */
  #include "gmxpre.h"
  
   */
  #include "gmxpre.h"
  
-#include "leapfrog_gpu.h"
-
-#include <assert.h>
-#include <stdio.h>
-
-#include <cmath>
-
-#include <algorithm>
+#include "leapfrog_gpu_internal.h"
  
  #include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/devicebuffer.h"
  
  #include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/devicebuffer.h"
@@ -101,19 +92,6 @@ constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
   * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
   */
  template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
   * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
   */
  template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
-__launch_bounds__(c_maxThreadsPerBlock) __global__
-        void leapfrog_kernel(const int numAtoms,
-                             float3* __restrict__ gm_x,
-                             float3* __restrict__ gm_xp,
-                             float3* __restrict__ gm_v,
-                             const float3* __restrict__ gm_f,
-                             const float* __restrict__ gm_inverseMasses,
-                             const float dt,
-                             const float* __restrict__ gm_lambdas,
-                             const unsigned short* __restrict__ gm_tempScaleGroups,
-                             const float3 prVelocityScalingMatrixDiagonal);
-
-template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
  __launch_bounds__(c_maxThreadsPerBlock) __global__
          void leapfrog_kernel(const int numAtoms,
                               float3* __restrict__ gm_x,
  __launch_bounds__(c_maxThreadsPerBlock) __global__
          void leapfrog_kernel(const int numAtoms,
                               float3* __restrict__ gm_x,
@@ -176,7 +154,6 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
          gm_v[threadIndex] = v;
          gm_x[threadIndex] = x;
      }
          gm_v[threadIndex] = v;
          gm_x[threadIndex] = x;
      }
-    return;
  }
  
  /*! \brief Select templated kernel.
  }
  
  /*! \brief Select templated kernel.
@@ -238,124 +215,49 @@ inline auto selectLeapFrogKernelPtr(bool                doTemperatureScaling,
      return kernelPtr;
  }
  
      return kernelPtr;
  }
  
-void LeapFrogGpu::integrate(DeviceBuffer<Float3>              d_x,
-                            DeviceBuffer<Float3>              d_xp,
-                            DeviceBuffer<Float3>              d_v,
-                            const DeviceBuffer<Float3>        d_f,
-                            const real                        dt,
-                            const bool                        doTemperatureScaling,
-                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                            const bool                        doParrinelloRahman,
-                            const float                       dtPressureCouple,
-                            const matrix                      prVelocityScalingMatrix)
+
+void launchLeapFrogKernel(const int                          numAtoms,
+                          DeviceBuffer<Float3>               d_x,
+                          DeviceBuffer<Float3>               d_xp,
+                          DeviceBuffer<Float3>               d_v,
+                          const DeviceBuffer<Float3>         d_f,
+                          const DeviceBuffer<float>          d_inverseMasses,
+                          const float                        dt,
+                          const bool                         doTemperatureScaling,
+                          const int                          numTempScaleValues,
+                          const DeviceBuffer<unsigned short> d_tempScaleGroups,
+                          const DeviceBuffer<float>          d_lambdas,
+                          const VelocityScalingType          prVelocityScalingType,
+                          const Float3                       prVelocityScalingMatrixDiagonal,
+                          const DeviceStream&                deviceStream)
  {
  {
+    // Checking the buffer types against the kernel argument types
+    static_assert(sizeof(*d_inverseMasses) == sizeof(float), "Incompatible types");
  
  
-    ensureNoPendingDeviceError("In CUDA version of Leap-Frog integrator");
+    KernelLaunchConfig kernelLaunchConfig;
  
  
-    auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
-    if (doTemperatureScaling || doParrinelloRahman)
-    {
-        if (doTemperatureScaling)
-        {
-            GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
-                       "Number of temperature scaling factors changed since it was set for the "
-                       "last time.");
-            for (int i = 0; i < numTempScaleValues_; i++)
-            {
-                h_lambdas_[i] = tcstat[i].lambda;
-            }
-            copyToDeviceBuffer(&d_lambdas_,
-                               h_lambdas_.data(),
-                               0,
-                               numTempScaleValues_,
-                               deviceStream_,
-                               GpuApiCallBehavior::Async,
-                               nullptr);
-        }
-        VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
-        if (doParrinelloRahman)
-        {
-            prVelocityScalingType = VelocityScalingType::Diagonal;
-            GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
-                               && prVelocityScalingMatrix[ZZ][YY] == 0
-                               && prVelocityScalingMatrix[XX][YY] == 0
-                               && prVelocityScalingMatrix[XX][ZZ] == 0
-                               && prVelocityScalingMatrix[YY][ZZ] == 0,
-                       "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
-                       "in GPU version of Leap-Frog integrator.");
-            prVelocityScalingMatrixDiagonal_ =
-                    Float3{ dtPressureCouple * prVelocityScalingMatrix[XX][XX],
-                            dtPressureCouple * prVelocityScalingMatrix[YY][YY],
-                            dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ] };
-        }
-        kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
-    }
+    kernelLaunchConfig.gridSize[0]      = (numAtoms + c_threadsPerBlock - 1) / c_threadsPerBlock;
+    kernelLaunchConfig.blockSize[0]     = c_threadsPerBlock;
+    kernelLaunchConfig.blockSize[1]     = 1;
+    kernelLaunchConfig.blockSize[2]     = 1;
+    kernelLaunchConfig.sharedMemorySize = 0;
+
+    auto kernelPtr =
+            selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues, prVelocityScalingType);
  
  
-    // Checking the buffer types against the kernel argument types
-    static_assert(sizeof(*d_inverseMasses_) == sizeof(float), "Incompatible types");
      const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
      const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
-                                                      kernelLaunchConfig_,
-                                                      &numAtoms_,
+                                                      kernelLaunchConfig,
+                                                      &numAtoms,
                                                        asFloat3Pointer(&d_x),
                                                        asFloat3Pointer(&d_xp),
                                                        asFloat3Pointer(&d_v),
                                                        asFloat3Pointer(&d_f),
                                                        asFloat3Pointer(&d_x),
                                                        asFloat3Pointer(&d_xp),
                                                        asFloat3Pointer(&d_v),
                                                        asFloat3Pointer(&d_f),
-                                                      &d_inverseMasses_,
+                                                      &d_inverseMasses,
                                                        &dt,
                                                        &dt,
-                                                      &d_lambdas_,
-                                                      &d_tempScaleGroups_,
-                                                      &prVelocityScalingMatrixDiagonal_);
-    launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr, "leapfrog_kernel", kernelArgs);
-
-    return;
-}
-
-LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
-                         const DeviceStream&  deviceStream,
-                         const int            numTempScaleValues) :
-    deviceContext_(deviceContext), deviceStream_(deviceStream), numTempScaleValues_(numTempScaleValues)
-{
-    numAtoms_ = 0;
-
-    changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);
-
-    kernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
-    kernelLaunchConfig_.blockSize[1]     = 1;
-    kernelLaunchConfig_.blockSize[2]     = 1;
-    kernelLaunchConfig_.sharedMemorySize = 0;
-
-    // If the temperature coupling is enabled, we need to make space for scaling factors
-    if (numTempScaleValues_ > 0)
-    {
-        h_lambdas_.resize(numTempScaleValues_);
-        reallocateDeviceBuffer(
-                &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
-    }
-}
-
-LeapFrogGpu::~LeapFrogGpu()
-{
-    freeDeviceBuffer(&d_inverseMasses_);
-}
-
-void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
-{
-    numAtoms_                       = numAtoms;
-    kernelLaunchConfig_.gridSize[0] = (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
-
-    reallocateDeviceBuffer(
-            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
-    copyToDeviceBuffer(
-            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-
-    // Temperature scale group map only used if there are more then one group
-    if (numTempScaleValues_ > 1)
-    {
-        reallocateDeviceBuffer(
-                &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
-        copyToDeviceBuffer(
-                &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-    }
+                                                      &d_lambdas,
+                                                      &d_tempScaleGroups,
+                                                      &prVelocityScalingMatrixDiagonal);
+    launchGpuKernel(kernelPtr, kernelLaunchConfig, deviceStream, nullptr, "leapfrog_kernel", kernelArgs);
  }
  
  } // namespace gmx
  }
  
  } // namespace gmx
diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal.h b/src/gromacs/mdlib/leapfrog_gpu_internal.h

new file mode 100644 (file)

index 0000000..76330fe
--- /dev/null
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal.h
@@ -0,0 +1,71 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief Declarations for backend specific GPU functions for Leap-Frog.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ * \inlibraryapi
+ */
+#ifndef GMX_MDLIB_LEAPFROG_GPU_INTERNAL_H
+#define GMX_MDLIB_LEAPFROG_GPU_INTERNAL_H
+
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/mdlib/leapfrog_gpu.h"
+
+namespace gmx
+{
+
+void launchLeapFrogKernel(int                          numAtoms,
+                          DeviceBuffer<Float3>         d_x,
+                          DeviceBuffer<Float3>         d_xp,
+                          DeviceBuffer<Float3>         d_v,
+                          DeviceBuffer<Float3>         d_f,
+                          DeviceBuffer<float>          d_inverseMasses,
+                          float                        dt,
+                          bool                         doTemperatureScaling,
+                          int                          numTempScaleValues,
+                          DeviceBuffer<unsigned short> d_tempScaleGroups,
+                          DeviceBuffer<float>          d_lambdas,
+                          VelocityScalingType          prVelocityScalingType,
+                          Float3                       prVelocityScalingMatrixDiagonal,
+                          const DeviceStream&          deviceStream);
+
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp

similarity index 65%

rename from src/gromacs/mdlib/leapfrog_gpu_sycl.cpp

rename to src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp

index 89997a69ca7c643b37293ae0cf6bfe45fd45b905..566c92b1b45e0d8512f222dc74de5df5a82218e8 100644 (file)
--- a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
@@ -36,9 +36,7 @@
   *
   * \brief Implements Leap-Frog using SYCL
   *
   *
   * \brief Implements Leap-Frog using SYCL
   *
- * This file contains implementation of basic Leap-Frog integrator
- * using SYCL, including class initialization, data-structures management
- * and GPU kernel.
+ * This file contains SYCL implementation of back-end specific code for Leap-Frog.
   *
   * \author Artem Zhmurov <zhmurov@gmail.com>
   * \author Andrey Alekseenko <al42and@gmail.com>
   *
   * \author Artem Zhmurov <zhmurov@gmail.com>
   * \author Andrey Alekseenko <al42and@gmail.com>
@@ -47,6 +45,8 @@
   */
  #include "gmxpre.h"
  
   */
  #include "gmxpre.h"
  
+#include "leapfrog_gpu_internal.h"
+
  #include "gromacs/gpu_utils/devicebuffer.h"
  #include "gromacs/gpu_utils/gmxsycl.h"
  #include "gromacs/math/vec.h"
  #include "gromacs/gpu_utils/devicebuffer.h"
  #include "gromacs/gpu_utils/gmxsycl.h"
  #include "gromacs/math/vec.h"
@@ -216,114 +216,38 @@ static inline cl::sycl::event launchLeapFrogKernel(NumTempScaleValues  tempScali
              prVelocityScalingType);
  }
  
              prVelocityScalingType);
  }
  
-void LeapFrogGpu::integrate(DeviceBuffer<Float3>              d_x,
-                            DeviceBuffer<Float3>              d_xp,
-                            DeviceBuffer<Float3>              d_v,
-                            DeviceBuffer<Float3>              d_f,
-                            const real                        dt,
-                            const bool                        doTemperatureScaling,
-                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                            const bool                        doParrinelloRahman,
-                            const float                       dtPressureCouple,
-                            const matrix                      prVelocityScalingMatrix)
+void launchLeapFrogKernel(int                                numAtoms,
+                          DeviceBuffer<Float3>               d_x,
+                          DeviceBuffer<Float3>               d_xp,
+                          DeviceBuffer<Float3>               d_v,
+                          const DeviceBuffer<Float3>         d_f,
+                          const DeviceBuffer<float>          d_inverseMasses,
+                          const float                        dt,
+                          const bool                         doTemperatureScaling,
+                          const int                          numTempScaleValues,
+                          const DeviceBuffer<unsigned short> d_tempScaleGroups,
+                          const DeviceBuffer<float>          d_lambdas,
+                          const VelocityScalingType          prVelocityScalingType,
+                          const Float3                       prVelocityScalingMatrixDiagonal,
+                          const DeviceStream&                deviceStream)
  {
  {
-    if (doTemperatureScaling)
-    {
-        GMX_ASSERT(checkDeviceBuffer(d_lambdas_, numTempScaleValues_),
-                   "Number of temperature scaling factors changed since it was set for the "
-                   "last time.");
-        GMX_RELEASE_ASSERT(gmx::ssize(h_lambdas_) == numTempScaleValues_,
-                           "Number of temperature scaling factors changed since it was set for the "
-                           "last time.");
-        /* We could use host accessors here, without h_lambdas_.
-         * According to a quick test, host accessor is slightly faster when using DPC++ and
-         * LevelZero compared to using h_lambdas_ + cgh.copy. But with DPC++ and OpenCL, the host
-         * accessor waits for fReadyOnDevice in UpdateConstrainGpu::Impl::integrate. See #4023. */
-
-        for (int i = 0; i < numTempScaleValues_; i++)
-        {
-            h_lambdas_[i] = tcstat[i].lambda;
-        }
-        copyToDeviceBuffer(&d_lambdas_,
-                           h_lambdas_.data(),
-                           0,
-                           numTempScaleValues_,
-                           deviceStream_,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
      NumTempScaleValues tempVelocityScalingType =
      NumTempScaleValues tempVelocityScalingType =
-            getTempScalingType(doTemperatureScaling, numTempScaleValues_);
+            getTempScalingType(doTemperatureScaling, numTempScaleValues);
  
  
-    VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
-    if (doParrinelloRahman)
-    {
-        prVelocityScalingType = VelocityScalingType::Diagonal;
-        GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
-                           && prVelocityScalingMatrix[ZZ][YY] == 0 && prVelocityScalingMatrix[XX][YY] == 0
-                           && prVelocityScalingMatrix[XX][ZZ] == 0 && prVelocityScalingMatrix[YY][ZZ] == 0,
-                   "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
-                   "in GPU version of Leap-Frog integrator.");
-        prVelocityScalingMatrixDiagonal_ = dtPressureCouple
-                                           * Float3{ prVelocityScalingMatrix[XX][XX],
-                                                     prVelocityScalingMatrix[YY][YY],
-                                                     prVelocityScalingMatrix[ZZ][ZZ] };
-    }
  
      launchLeapFrogKernel(tempVelocityScalingType,
                           prVelocityScalingType,
  
      launchLeapFrogKernel(tempVelocityScalingType,
                           prVelocityScalingType,
-                         deviceStream_,
-                         numAtoms_,
+                         deviceStream,
+                         numAtoms,
                           d_x,
                           d_xp,
                           d_v,
                           d_f,
                           d_x,
                           d_xp,
                           d_v,
                           d_f,
-                         d_inverseMasses_,
+                         d_inverseMasses,
                           dt,
                           dt,
-                         d_lambdas_,
-                         d_tempScaleGroups_,
-                         prVelocityScalingMatrixDiagonal_);
-}
-
-LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
-                         const DeviceStream&  deviceStream,
-                         const int            numTempScaleValues) :
-    deviceContext_(deviceContext),
-    deviceStream_(deviceStream),
-    numAtoms_(0),
-    numTempScaleValues_(numTempScaleValues)
-{
-    // If the temperature coupling is enabled, we need to make space for scaling factors
-    if (numTempScaleValues_ > 0)
-    {
-        h_lambdas_.resize(numTempScaleValues_);
-        reallocateDeviceBuffer(
-                &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
-    }
-}
-
-LeapFrogGpu::~LeapFrogGpu()
-{
-    freeDeviceBuffer(&d_inverseMasses_);
-}
-
-void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
-{
-    numAtoms_ = numAtoms;
-
-    reallocateDeviceBuffer(
-            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
-    copyToDeviceBuffer(
-            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-
-    // Temperature scale group map only used if there are more then one group
-    if (numTempScaleValues_ > 1)
-    {
-        reallocateDeviceBuffer(
-                &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
-        copyToDeviceBuffer(
-                &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-    }
+                         d_lambdas,
+                         d_tempScaleGroups,
+                         prVelocityScalingMatrixDiagonal);
  }
  
  } // namespace gmx
  }
  
  } // namespace gmx
author	Artem Zhmurov <zhmurov@gmail.com>
	Wed, 19 May 2021 10:53:55 +0000 (10:53 +0000)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Wed, 19 May 2021 10:53:55 +0000 (10:53 +0000)
src/gromacs/mdlib/CMakeLists.txt		patch \| blob \| history
src/gromacs/mdlib/leapfrog_gpu.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/mdlib/leapfrog_gpu.h		patch \| blob \| history
src/gromacs/mdlib/leapfrog_gpu_internal.cu	[moved from src/gromacs/mdlib/leapfrog_gpu.cu with 62% similarity]	patch \| blob \| history
src/gromacs/mdlib/leapfrog_gpu_internal.h	[new file with mode: 0644]	patch \| blob
src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp	[moved from src/gromacs/mdlib/leapfrog_gpu_sycl.cpp with 65% similarity]	patch \| blob \| history