Make common GPU Force Reduction code backend-agnostic
authorAndrey Alekseenko <al42and@gmail.com>
Wed, 5 May 2021 13:05:50 +0000 (13:05 +0000)
committerJoe Jordan <ejjordan12@gmail.com>
Wed, 5 May 2021 13:05:50 +0000 (13:05 +0000)
src/gromacs/mdlib/CMakeLists.txt
src/gromacs/mdlib/gpuforcereduction.h
src/gromacs/mdlib/gpuforcereduction_impl.cpp
src/gromacs/mdlib/gpuforcereduction_impl.cu [deleted file]
src/gromacs/mdlib/gpuforcereduction_impl.h
src/gromacs/mdlib/gpuforcereduction_impl_internal.cu [new file with mode: 0644]
src/gromacs/mdlib/gpuforcereduction_impl_internal.h [new file with mode: 0644]
src/gromacs/mdlib/gpuforcereduction_impl_stubs.cpp [new file with mode: 0644]

index 3f905dedcf25fb15a851e059209ef32636f4510f..060f6160d7ba9dea6d9de1bdb53eb6beb001b449 100644 (file)
@@ -36,8 +36,9 @@
 add_library(mdlib INTERFACE)
 
 file(GLOB MDLIB_SOURCES *.cpp)
-# To avoid listing all the necessary files manually, we will remove SYCL-specfific files here:
+# To avoid listing all the necessary files manually, we will remove SYCL-specific files here:
 list(REMOVE_ITEM MDLIB_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp
@@ -48,14 +49,16 @@ set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
 if(GMX_GPU_CUDA)
     gmx_add_libgromacs_sources(
        leapfrog_gpu.cu
+       gpuforcereduction_impl.cpp
+       gpuforcereduction_impl_internal.cu
        lincs_gpu.cpp
        lincs_gpu_internal.cu
        settle_gpu.cpp
        settle_gpu_internal.cu
        update_constrain_gpu_impl.cu
-       gpuforcereduction_impl.cu
        )
     _gmx_add_files_to_property(CUDA_SOURCES
+       gpuforcereduction_impl.cpp
        lincs_gpu.cpp
        settle_gpu.cpp
        )
index 2955dd60c2013e48ac126ec1c97ae7f6ed58065d..e015d7ef5e03c70b4782ee74f7e8b13e622bbaea 100644 (file)
@@ -45,6 +45,8 @@
 
 #include <memory>
 
+#include "config.h"
+
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/timing/wallcycle.h"
@@ -58,6 +60,8 @@ class DeviceContext;
 namespace gmx
 {
 
+#define HAVE_GPU_FORCE_REDUCTION (GMX_GPU_CUDA)
+
 /*! \internal
  * \brief Manages the force reduction directly in GPU memory
  *
@@ -92,7 +96,7 @@ public:
      *
      * \param [in] forcePtr  Pointer to force to be reduced
      */
-    void registerRvecForce(DeviceBuffer<gmx::RVec> forcePtr);
+    void registerRvecForce(DeviceBuffer<RVec> forcePtr);
 
     /*! \brief Add a dependency for this force reduction
      *
index 6d826d66757c6936e60e8451790debedcf89954d..fb58c5c9437ef48cbadfb5ac4c9592d9b4f006ea 100644 (file)
@@ -34,7 +34,7 @@
  */
 /*! \internal \file
  *
- * \brief May be used to implement force reduction interfaces for non-GPU builds.
+ * \brief Implements backend-agnostic GPU Force Reduction functions
  *
  * \author Alan Gray <alang@nvidia.com>
  *
 
 #include "gmxpre.h"
 
-#include "config.h"
+#include "gpuforcereduction_impl.h"
 
-#include "gpuforcereduction.h"
-
-#if !GMX_GPU_CUDA
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#if GMX_GPU_CUDA
+#    include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#elif GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
+#endif
+#include "gromacs/mdlib/gpuforcereduction_impl_internal.h"
+#include "gromacs/utility/gmxassert.h"
 
 namespace gmx
 {
 
-class GpuForceReduction::Impl
+GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
+                              const DeviceStream&  deviceStream,
+                              gmx_wallcycle*       wcycle) :
+    baseForce_(),
+    deviceContext_(deviceContext),
+    deviceStream_(deviceStream),
+    nbnxmForceToAdd_(),
+    rvecForceToAdd_(),
+    wcycle_(wcycle)
+{
+}
+
+void GpuForceReduction::Impl::reinit(DeviceBuffer<Float3>  baseForcePtr,
+                                     const int             numAtoms,
+                                     ArrayRef<const int>   cell,
+                                     const int             atomStart,
+                                     const bool            accumulate,
+                                     GpuEventSynchronizer* completionMarker)
+{
+    GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data");
+    baseForce_        = baseForcePtr;
+    numAtoms_         = numAtoms;
+    atomStart_        = atomStart;
+    accumulate_       = static_cast<int>(accumulate);
+    completionMarker_ = completionMarker;
+    cellInfo_.cell    = cell.data();
+
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    reallocateDeviceBuffer(
+            &cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize, &cellInfo_.cellSizeAlloc, deviceContext_);
+    copyToDeviceBuffer(&cellInfo_.d_cell,
+                       &(cellInfo_.cell[atomStart]),
+                       0,
+                       numAtoms_,
+                       deviceStream_,
+                       GpuApiCallBehavior::Async,
+                       nullptr);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
+
+    dependencyList_.clear();
+};
+
+void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
 {
+    GMX_ASSERT(forcePtr, "Input force for reduction has no data");
+    nbnxmForceToAdd_ = forcePtr;
 };
 
-GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */,
-                                     const DeviceStream& /* deviceStream */,
-                                     gmx_wallcycle* /*wcycle*/) :
-    impl_(nullptr)
+void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer<RVec> forcePtr)
+{
+    GMX_ASSERT(forcePtr, "Input force for reduction has no data");
+    rvecForceToAdd_ = forcePtr;
+};
+
+void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency)
+{
+    dependencyList_.push_back(dependency);
+}
+
+void GpuForceReduction::Impl::execute()
 {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
+
+    if (numAtoms_ == 0)
+    {
+        return;
+    }
+
+    GMX_ASSERT(nbnxmForceToAdd_, "Nbnxm force for reduction has no data");
+
+    // Enqueue wait on all dependencies passed
+    for (auto* synchronizer : dependencyList_)
+    {
+        synchronizer->enqueueWaitEvent(deviceStream_);
+    }
+
+    const bool addRvecForce = static_cast<bool>(rvecForceToAdd_); // True iff initialized
+
+    launchForceReductionKernel(numAtoms_,
+                               atomStart_,
+                               addRvecForce,
+                               accumulate_,
+                               nbnxmForceToAdd_,
+                               rvecForceToAdd_,
+                               baseForce_,
+                               cellInfo_.d_cell,
+                               deviceStream_);
+
+    // Mark that kernel has been launched
+    if (completionMarker_ != nullptr)
+    {
+        completionMarker_->markEvent(deviceStream_);
+    }
+
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::reinit(DeviceBuffer<RVec> /*baseForcePtr*/,
-                               const int /*numAtoms*/,
-                               ArrayRef<const int> /*cell*/,
-                               const int /*atomStart*/,
-                               const bool /*accumulate*/,
-                               GpuEventSynchronizer* /*completionMarker*/)
+GpuForceReduction::Impl::~Impl() = default;
+
+GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
+                                     const DeviceStream&  deviceStream,
+                                     gmx_wallcycle*       wcycle) :
+    impl_(new Impl(deviceContext, deviceStream, wcycle))
 {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
 }
 
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> /* forcePtr */)
+void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
 {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    impl_->registerNbnxmForce(forcePtr);
 }
 
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::registerRvecForce(DeviceBuffer<gmx::RVec> /* forcePtr */)
+void GpuForceReduction::registerRvecForce(DeviceBuffer<RVec> forcePtr)
 {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    impl_->registerRvecForce(forcePtr);
 }
 
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
-void GpuForceReduction::addDependency(GpuEventSynchronizer* const /* dependency */)
+void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
 {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    impl_->addDependency(dependency);
 }
 
-// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForcePtr,
+                               const int             numAtoms,
+                               ArrayRef<const int>   cell,
+                               const int             atomStart,
+                               const bool            accumulate,
+                               GpuEventSynchronizer* completionMarker)
+{
+    impl_->reinit(baseForcePtr, numAtoms, cell, atomStart, accumulate, completionMarker);
+}
 void GpuForceReduction::execute()
 {
-    GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+    impl_->execute();
 }
 
 GpuForceReduction::~GpuForceReduction() = default;
 
 } // namespace gmx
-
-#endif
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu
deleted file mode 100644 (file)
index 471cc1d..0000000
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief Implements GPU Force Reduction using CUDA
- *
- * \author Alan Gray <alang@nvidia.com>
- *
- * \ingroup module_mdlib
- */
-
-#include "gmxpre.h"
-
-#include "gpuforcereduction_impl.h"
-
-#include <stdio.h>
-
-#include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/device_context.h"
-#include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gpu_utils.h"
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-#include "gromacs/gpu_utils/typecasts.cuh"
-#include "gromacs/gpu_utils/vectype_ops.cuh"
-#include "gromacs/utility/gmxassert.h"
-
-#include "gpuforcereduction.h"
-
-namespace gmx
-{
-
-constexpr static int c_threadsPerBlock = 128;
-
-typedef struct rvecDeviceForceData rvecDeviceForceData_t;
-
-
-template<bool addRvecForce, bool accumulateForce>
-static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce,
-                                    const float3* __restrict__ rvecForceToAdd,
-                                    float3*    gm_fTotal,
-                                    const int* gm_cell,
-                                    const int  numAtoms)
-{
-
-    // map particle-level parallelism to 1D CUDA thread and block index
-    const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
-
-    // perform addition for each particle
-    if (threadIndex < numAtoms)
-    {
-
-        float3* gm_fDest = &gm_fTotal[threadIndex];
-        float3  temp;
-
-        // Accumulate or set nbnxm force
-        if (accumulateForce)
-        {
-            temp = *gm_fDest;
-            temp += gm_nbnxmForce[gm_cell[threadIndex]];
-        }
-        else
-        {
-            temp = gm_nbnxmForce[gm_cell[threadIndex]];
-        }
-
-        if (addRvecForce)
-        {
-            temp += rvecForceToAdd[threadIndex];
-        }
-
-        *gm_fDest = temp;
-    }
-    return;
-}
-
-GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
-                              const DeviceStream&  deviceStream,
-                              gmx_wallcycle*       wcycle) :
-    baseForce_(nullptr),
-    deviceContext_(deviceContext),
-    deviceStream_(deviceStream),
-    nbnxmForceToAdd_(nullptr),
-    rvecForceToAdd_(nullptr),
-    wcycle_(wcycle){};
-
-void GpuForceReduction::Impl::reinit(DeviceBuffer<Float3>  baseForcePtr,
-                                     const int             numAtoms,
-                                     ArrayRef<const int>   cell,
-                                     const int             atomStart,
-                                     const bool            accumulate,
-                                     GpuEventSynchronizer* completionMarker)
-{
-    GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data");
-    baseForce_        = baseForcePtr;
-    numAtoms_         = numAtoms;
-    atomStart_        = atomStart;
-    accumulate_       = accumulate;
-    completionMarker_ = completionMarker;
-    cellInfo_.cell    = cell.data();
-
-    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
-    reallocateDeviceBuffer(
-            &cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize, &cellInfo_.cellSizeAlloc, deviceContext_);
-    copyToDeviceBuffer(&cellInfo_.d_cell,
-                       &(cellInfo_.cell[atomStart]),
-                       0,
-                       numAtoms_,
-                       deviceStream_,
-                       GpuApiCallBehavior::Async,
-                       nullptr);
-    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-
-    dependencyList_.clear();
-};
-
-void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
-{
-    GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
-    nbnxmForceToAdd_ = forcePtr;
-};
-
-void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer<RVec> forcePtr)
-{
-    GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
-    rvecForceToAdd_ = forcePtr;
-};
-
-void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency)
-{
-    dependencyList_.push_back(dependency);
-}
-
-void GpuForceReduction::Impl::execute()
-{
-    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
-    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
-
-    if (numAtoms_ == 0)
-    {
-        return;
-    }
-
-    GMX_ASSERT((nbnxmForceToAdd_ != nullptr), "Nbnxm force for reduction has no data");
-
-    // Enqueue wait on all dependencies passed
-    for (auto const synchronizer : dependencyList_)
-    {
-        synchronizer->enqueueWaitEvent(deviceStream_);
-    }
-
-    float3* d_baseForce      = &(asFloat3(baseForce_)[atomStart_]);
-    float3* d_nbnxmForce     = asFloat3(nbnxmForceToAdd_);
-    float3* d_rvecForceToAdd = &(asFloat3(rvecForceToAdd_)[atomStart_]);
-
-    // Configure and launch kernel
-    KernelLaunchConfig config;
-    config.blockSize[0]     = c_threadsPerBlock;
-    config.blockSize[1]     = 1;
-    config.blockSize[2]     = 1;
-    config.gridSize[0]      = ((numAtoms_ + 1) + c_threadsPerBlock - 1) / c_threadsPerBlock;
-    config.gridSize[1]      = 1;
-    config.gridSize[2]      = 1;
-    config.sharedMemorySize = 0;
-
-    auto kernelFn = (rvecForceToAdd_ != nullptr)
-                            ? (accumulate_ ? reduceKernel<true, true> : reduceKernel<true, false>)
-                            : (accumulate_ ? reduceKernel<false, true> : reduceKernel<false, false>);
-
-    const auto kernelArgs = prepareGpuKernelArguments(
-            kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, &d_baseForce, &cellInfo_.d_cell, &numAtoms_);
-
-    launchGpuKernel(kernelFn, config, deviceStream_, nullptr, "Force Reduction", kernelArgs);
-
-    // Mark that kernel has been launched
-    if (completionMarker_ != nullptr)
-    {
-        completionMarker_->markEvent(deviceStream_);
-    }
-
-    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
-    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-}
-
-GpuForceReduction::Impl::~Impl(){};
-
-GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
-                                     const DeviceStream&  deviceStream,
-                                     gmx_wallcycle*       wcycle) :
-    impl_(new Impl(deviceContext, deviceStream, wcycle))
-{
-}
-
-void GpuForceReduction::registerNbnxmForce(DeviceBuffer<Float3> forcePtr)
-{
-    impl_->registerNbnxmForce(forcePtr);
-}
-
-void GpuForceReduction::registerRvecForce(DeviceBuffer<gmx::RVec> forcePtr)
-{
-    impl_->registerRvecForce(forcePtr);
-}
-
-void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
-{
-    impl_->addDependency(dependency);
-}
-
-void GpuForceReduction::reinit(DeviceBuffer<RVec>    baseForcePtr,
-                               const int             numAtoms,
-                               ArrayRef<const int>   cell,
-                               const int             atomStart,
-                               const bool            accumulate,
-                               GpuEventSynchronizer* completionMarker)
-{
-    impl_->reinit(baseForcePtr, numAtoms, cell, atomStart, accumulate, completionMarker);
-}
-void GpuForceReduction::execute()
-{
-    impl_->execute();
-}
-
-GpuForceReduction::~GpuForceReduction() = default;
-
-} // namespace gmx
index 2f7f3f40ce7f8f51512747f0e88248ffb67dce93..316f4cca317f2d1da0a39a3224b3546767e593f3 100644 (file)
@@ -138,7 +138,7 @@ private:
     DeviceBuffer<RVec> nbnxmForceToAdd_;
     //! Rvec-format force to be added in this reduction
     DeviceBuffer<RVec> rvecForceToAdd_;
-    //! event to be marked when redcution launch has been completed
+    //! event to be marked when reduction launch has been completed
     GpuEventSynchronizer* completionMarker_ = nullptr;
     //! The wallclock counter
     gmx_wallcycle* wcycle_ = nullptr;
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu b/src/gromacs/mdlib/gpuforcereduction_impl_internal.cu
new file mode 100644 (file)
index 0000000..4fb187f
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements GPU Force Reduction using CUDA
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_mdlib
+ */
+
+#include "gmxpre.h"
+
+#include "gpuforcereduction_impl_internal.h"
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/typecasts.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+
+namespace gmx
+{
+
+constexpr static int c_threadsPerBlock = 128;
+
+template<bool addRvecForce, bool accumulateForce>
+static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce,
+                                    const float3* __restrict__ rvecForceToAdd,
+                                    float3*    gm_fTotal,
+                                    const int* gm_cell,
+                                    const int  numAtoms)
+{
+
+    // map particle-level parallelism to 1D CUDA thread and block index
+    const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // perform addition for each particle
+    if (threadIndex < numAtoms)
+    {
+
+        float3* gm_fDest = &gm_fTotal[threadIndex];
+        float3  temp;
+
+        // Accumulate or set nbnxm force
+        if (accumulateForce)
+        {
+            temp = *gm_fDest;
+            temp += gm_nbnxmForce[gm_cell[threadIndex]];
+        }
+        else
+        {
+            temp = gm_nbnxmForce[gm_cell[threadIndex]];
+        }
+
+        if (addRvecForce)
+        {
+            temp += rvecForceToAdd[threadIndex];
+        }
+
+        *gm_fDest = temp;
+    }
+    return;
+}
+
+void launchForceReductionKernel(int                        numAtoms,
+                                int                        atomStart,
+                                bool                       addRvecForce,
+                                bool                       accumulate,
+                                const DeviceBuffer<Float3> d_nbnxmForceToAdd,
+                                const DeviceBuffer<Float3> d_rvecForceToAdd,
+                                DeviceBuffer<Float3>       d_baseForce,
+                                DeviceBuffer<int>          d_cell,
+                                const DeviceStream&        deviceStream)
+{
+    float3* d_baseForcePtr      = &(asFloat3(d_baseForce)[atomStart]);
+    float3* d_nbnxmForcePtr     = asFloat3(d_nbnxmForceToAdd);
+    float3* d_rvecForceToAddPtr = &(asFloat3(d_rvecForceToAdd)[atomStart]);
+
+    // Configure and launch kernel
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_threadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = ((numAtoms + 1) + c_threadsPerBlock - 1) / c_threadsPerBlock;
+    config.gridSize[1]      = 1;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+
+    auto kernelFn = addRvecForce
+                            ? (accumulate ? reduceKernel<true, true> : reduceKernel<true, false>)
+                            : (accumulate ? reduceKernel<false, true> : reduceKernel<false, false>);
+
+    const auto kernelArgs = prepareGpuKernelArguments(
+            kernelFn, config, &d_nbnxmForcePtr, &d_rvecForceToAddPtr, &d_baseForcePtr, &d_cell, &numAtoms);
+
+    launchGpuKernel(kernelFn, config, deviceStream, nullptr, "Force Reduction", kernelArgs);
+}
+
+} // namespace gmx
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_internal.h b/src/gromacs/mdlib/gpuforcereduction_impl_internal.h
new file mode 100644 (file)
index 0000000..23b87a2
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares vendor-specific function to launch force reduction kernel
+ *
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+
+class DeviceStream;
+
+namespace gmx
+{
+
+/*! \brief Backend-specific function to launch GPU Force Reduction kernel.
+ *
+ * In pseudocode:
+ *
+ * \code{.cpp}
+ * for (int i = 0; i < numAtoms; i++) {
+ *     totalForce = d_nbnxmForceToAdd[d_cell[i]]
+ *     if (accumulate)
+ *         totalForce += d_baseForce[atomStart + i]
+ *     if (addRvecForce)
+ *         totalForce += d_rvecForceToAdd[atomStart + i]
+ *     d_baseForce[atomStart + i] = totalForce[i]
+ * }
+ * \endcode
+ *
+ * \param numAtoms Number of atoms subject to reduction.
+ * \param atomStart First atom index (for \p d_rvecForceToAdd and \p d_baseForce).
+ * \param addRvecForce When \c false, \p d_rvecForceToAdd is ignored.
+ * \param accumulate When \c false, the previous values of \p d_baseForce are discarded.
+ * \param d_nbnxmForceToAdd Buffer containing Nbnxm forces in Nbnxm layout.
+ * \param d_rvecForceToAdd Optional buffer containing arbitrary forces in linear layout.
+ * \param d_baseForce Destination buffer for forces in linear layout.
+ * \param d_cell Atom index to Nbnxm cell index.
+ * \param deviceStream Device stream for kernel submission.
+ */
+void launchForceReductionKernel(int                  numAtoms,
+                                int                  atomStart,
+                                bool                 addRvecForce,
+                                bool                 accumulate,
+                                DeviceBuffer<Float3> d_nbnxmForceToAdd,
+                                DeviceBuffer<Float3> d_rvecForceToAdd,
+                                DeviceBuffer<Float3> d_baseForce,
+                                DeviceBuffer<int>    d_cell,
+                                const DeviceStream&  deviceStream);
+
+} // namespace gmx
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl_stubs.cpp b/src/gromacs/mdlib/gpuforcereduction_impl_stubs.cpp
new file mode 100644 (file)
index 0000000..7b1fe88
--- /dev/null
@@ -0,0 +1,106 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief May be used to implement force reduction interfaces for non-GPU builds.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_mdlib
+ */
+
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gpuforcereduction.h"
+
+#if !HAVE_GPU_FORCE_REDUCTION
+
+namespace gmx
+{
+
+class GpuForceReduction::Impl
+{
+};
+
+GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */,
+                                     const DeviceStream& /* deviceStream */,
+                                     gmx_wallcycle* /*wcycle*/) :
+    impl_(nullptr)
+{
+    GMX_RELEASE_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::reinit(DeviceBuffer<RVec> /*baseForcePtr*/,
+                               const int /*numAtoms*/,
+                               ArrayRef<const int> /*cell*/,
+                               const int /*atomStart*/,
+                               const bool /*accumulate*/,
+                               GpuEventSynchronizer* /*completionMarker*/)
+{
+    GMX_RELEASE_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::registerNbnxmForce(DeviceBuffer<RVec> /* forcePtr */)
+{
+    GMX_RELEASE_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::registerRvecForce(DeviceBuffer<gmx::RVec> /* forcePtr */)
+{
+    GMX_RELEASE_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::addDependency(GpuEventSynchronizer* const /* dependency */)
+{
+    GMX_RELEASE_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::execute()
+{
+    GMX_RELEASE_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+GpuForceReduction::~GpuForceReduction() = default;
+
+} // namespace gmx
+
+#endif /* !HAVE_GPU_FORCE_REDUCTION */