/*! \brief
* Return pointer to event recorded when forces are ready
*/
- void* getForcesReadySynchronizer();
+ GpuEventSynchronizer* getForcesReadySynchronizer();
private:
class Impl;
return nullptr;
}
-void* PmePpCommGpu::getForcesReadySynchronizer()
+GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer()
{
GMX_ASSERT(!impl_,
"A CPU stub for PME-PP GPU communication was called instead of the correct "
return static_cast<void*>(d_pmeForces_);
}
-void* PmePpCommGpu::Impl::getForcesReadySynchronizer()
+GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
{
- return static_cast<void*>(&forcesReadySynchronizer_);
+ return &forcesReadySynchronizer_;
}
PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
return impl_->getGpuForceStagingPtr();
}
-void* PmePpCommGpu::getForcesReadySynchronizer()
+GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer()
{
return impl_->getForcesReadySynchronizer();
}
/*! \brief
* Return pointer to event recorded when forces are ready
*/
- void* getForcesReadySynchronizer();
+ GpuEventSynchronizer* getForcesReadySynchronizer();
private:
//! GPU context handle (not used in CUDA)
lincs_gpu.cu
settle_gpu.cu
update_constrain_gpu_impl.cu
+ gpuforcereduction_impl.cu
)
endif()
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/strconvert.h"
+#include "gpuforcereduction.h"
+
ForceHelperBuffers::ForceHelperBuffers(bool haveDirectVirialContributions) :
haveDirectVirialContributions_(haveDirectVirialContributions)
{
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares the GPU Force Reduction
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_mdlib
+ */
+#ifndef GMX_MDLIB_GPUFORCEREDUCTION_H
+#define GMX_MDLIB_GPUFORCEREDUCTION_H
+
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/fixedcapacityvector.h"
+
+class GpuEventSynchronizer;
+
+namespace gmx
+{
+
+/*! \libinternal
+ * \brief Manages the force reduction directly in GPU memory
+ *
+ * Manages the reduction of multiple GPU force buffers into a single
+ * GPU force buffer. The reduction involves at least one (input/output)
+ * Rvec-format buffer and one (input) Nbat-format buffer, where the
+ * Nbat->Rvec conversion is handled internally. One additional (input)
+ * Rvec-format buffer is supported as optional.
+ */
+class GpuForceReduction
+{
+
+public:
+ /*! \brief Creates GPU force reduction object
+ *
+ * \param [in] deviceContext GPU device context
+ * \param [in] deviceStream Stream to use for reduction
+ */
+ GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ ~GpuForceReduction();
+
+ /*! \brief Register a nbnxm-format force to be reduced
+ *
+ * \param [in] forcePtr Pointer to force to be reduced
+ */
+ void registerNbnxmForce(void* forcePtr);
+
+ /*! \brief Register a rvec-format force to be reduced
+ *
+ * \param [in] forcePtr Pointer to force to be reduced
+ */
+ void registerRvecForce(void* forcePtr);
+
+ /*! \brief Add a dependency for this force reduction
+ *
+ * \param [in] dependency Dependency for this reduction
+ */
+ void addDependency(GpuEventSynchronizer* dependency);
+
+ /*! \brief Reinitialize the GPU force reduction
+ *
+ * \param [in] baseForcePtr Pointer to force to be used as a base
+ * \param [in] numAtoms The number of atoms
+ * \param [in] cell Pointer to the cell array
+ * \param [in] atomStart The start atom for the reduction
+ * \param [in] accumulate Whether reduction should be accumulated
+ * \param [in] completionMarker Event to be marked when launch of reduction is complete
+ */
+ void reinit(DeviceBuffer<RVec> baseForcePtr,
+ int numAtoms,
+ ArrayRef<const int> cell,
+ int atomStart,
+ bool accumulate,
+ GpuEventSynchronizer* completionMarker = nullptr);
+
+ /*! \brief Execute the force reduction */
+ void execute();
+
+private:
+ class Impl;
+ gmx::PrivateImplPointer<Impl> impl_;
+};
+
+} // namespace gmx
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief May be used to implement force reduction interfaces for non-GPU builds.
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_mdlib
+ */
+
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gpuforcereduction.h"
+
+#if !GMX_GPU_CUDA
+
+namespace gmx
+{
+
+class GpuForceReduction::Impl
+{
+};
+
+GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */,
+ const DeviceStream& /* deviceStream */) :
+ impl_(nullptr)
+{
+ GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::reinit(DeviceBuffer<RVec> /*baseForcePtr*/,
+ const int /*numAtoms*/,
+ ArrayRef<const int> /*cell*/,
+ const int /*atomStart*/,
+ const bool /*accumulate*/,
+ GpuEventSynchronizer* /*completionMarker*/)
+{
+ GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */)
+{
+ GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::registerRvecForce(void* /* forcePtr */)
+{
+ GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::addDependency(GpuEventSynchronizer* const /* dependency */)
+{
+ GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+// NOLINTNEXTLINE readability-convert-member-functions-to-static
+void GpuForceReduction::execute()
+{
+ GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
+}
+
+GpuForceReduction::~GpuForceReduction() = default;
+
+} // namespace gmx
+
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements GPU Force Reduction using CUDA
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_mdlib
+ */
+
+#include "gmxpre.h"
+
+#include "gpuforcereduction_impl.cuh"
+
+#include <stdio.h>
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/typecasts.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+#include "gromacs/utility/gmxassert.h"
+
+#include "gpuforcereduction.h"
+
+namespace gmx
+{
+
+constexpr static int c_threadsPerBlock = 128;
+
+typedef struct rvecDeviceForceData rvecDeviceForceData_t;
+
+
+template<bool addRvecForce, bool accumulateForce>
+static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce,
+ const float3* __restrict__ rvecForceToAdd,
+ float3* gm_fTotal,
+ const int* gm_cell,
+ const int numAtoms)
+{
+
+ // map particle-level parallelism to 1D CUDA thread and block index
+ const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // perform addition for each particle
+ if (threadIndex < numAtoms)
+ {
+
+ float3* gm_fDest = &gm_fTotal[threadIndex];
+ float3 temp;
+
+ // Accumulate or set nbnxm force
+ if (accumulateForce)
+ {
+ temp = *gm_fDest;
+ temp += gm_nbnxmForce[gm_cell[threadIndex]];
+ }
+ else
+ {
+ temp = gm_nbnxmForce[gm_cell[threadIndex]];
+ }
+
+ if (addRvecForce)
+ {
+ temp += rvecForceToAdd[threadIndex];
+ }
+
+ *gm_fDest = temp;
+ }
+ return;
+}
+
+GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
+ deviceContext_(deviceContext),
+ deviceStream_(deviceStream){};
+
+void GpuForceReduction::Impl::reinit(float3* baseForcePtr,
+ const int numAtoms,
+ ArrayRef<const int> cell,
+ const int atomStart,
+ const bool accumulate,
+ GpuEventSynchronizer* completionMarker)
+{
+ GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data");
+ baseForce_ = &(baseForcePtr[atomStart]);
+ numAtoms_ = numAtoms;
+ atomStart_ = atomStart;
+ accumulate_ = accumulate;
+ completionMarker_ = completionMarker;
+ cellInfo_.cell = cell.data();
+ reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize,
+ &cellInfo_.cellSizeAlloc, deviceContext_);
+ copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_,
+ GpuApiCallBehavior::Async, nullptr);
+
+ dependencyList_.clear();
+};
+
+void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer<RVec> forcePtr)
+{
+ GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
+ nbnxmForceToAdd_ = forcePtr;
+};
+
+void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer<RVec> forcePtr)
+{
+ GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data");
+ rvecForceToAdd_ = forcePtr;
+};
+
+void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency)
+{
+ dependencyList_.push_back(dependency);
+}
+
+void GpuForceReduction::Impl::execute()
+{
+
+ if (numAtoms_ == 0)
+ {
+ return;
+ }
+
+ GMX_ASSERT((nbnxmForceToAdd_ != nullptr), "Nbnxm force for reduction has no data");
+
+ // Enqueue wait on all dependencies passed
+ for (auto const synchronizer : dependencyList_)
+ {
+ synchronizer->enqueueWaitEvent(deviceStream_);
+ }
+
+ float3* d_nbnxmForce = asFloat3(nbnxmForceToAdd_);
+ float3* d_rvecForceToAdd = &(asFloat3(rvecForceToAdd_)[atomStart_]);
+
+ // Configure and launch kernel
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_threadsPerBlock;
+ config.blockSize[1] = 1;
+ config.blockSize[2] = 1;
+ config.gridSize[0] = ((numAtoms_ + 1) + c_threadsPerBlock - 1) / c_threadsPerBlock;
+ config.gridSize[1] = 1;
+ config.gridSize[2] = 1;
+ config.sharedMemorySize = 0;
+
+ auto kernelFn = (rvecForceToAdd_ != nullptr)
+ ? (accumulate_ ? reduceKernel<true, true> : reduceKernel<true, false>)
+ : (accumulate_ ? reduceKernel<false, true> : reduceKernel<false, false>);
+
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd,
+ &baseForce_, &cellInfo_.d_cell, &numAtoms_);
+
+ launchGpuKernel(kernelFn, config, deviceStream_, nullptr, "Force Reduction", kernelArgs);
+
+ // Mark that kernel has been launched
+ if (completionMarker_ != nullptr)
+ {
+ completionMarker_->markEvent(deviceStream_);
+ }
+}
+
+GpuForceReduction::Impl::~Impl(){};
+
+GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
+ impl_(new Impl(deviceContext, deviceStream))
+{
+}
+
+void GpuForceReduction::registerNbnxmForce(void* forcePtr)
+{
+ impl_->registerNbnxmForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
+}
+
+void GpuForceReduction::registerRvecForce(void* forcePtr)
+{
+ impl_->registerRvecForce(reinterpret_cast<DeviceBuffer<RVec>>(forcePtr));
+}
+
+void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency)
+{
+ impl_->addDependency(dependency);
+}
+
+void GpuForceReduction::reinit(DeviceBuffer<RVec> baseForcePtr,
+ const int numAtoms,
+ ArrayRef<const int> cell,
+ const int atomStart,
+ const bool accumulate,
+ GpuEventSynchronizer* completionMarker)
+{
+ impl_->reinit(asFloat3(baseForcePtr), numAtoms, cell, atomStart, accumulate, completionMarker);
+}
+void GpuForceReduction::execute()
+{
+ impl_->execute();
+}
+
+GpuForceReduction::~GpuForceReduction() = default;
+
+} // namespace gmx
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares the GPU Force Reduction
+ *
+ * \author Alan Gray <alang@nvidia.com>
+ *
+ * \ingroup module_mdlib
+ */
+#ifndef GMX_MDLIB_GPUFORCEREDUCTION_IMPL_H
+#define GMX_MDLIB_GPUFORCEREDUCTION_IMPL_H
+
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/math/vectypes.h"
+
+#include "gpuforcereduction.h"
+
+namespace gmx
+{
+
+//! structure to hold cell information for any nbat-format forces
+struct cellInfo
+{
+ //! cell index mapping for any nbat-format forces
+ const int* cell = nullptr;
+ //! device copy of cell index mapping for any nbat-format forces
+ int* d_cell = nullptr;
+ //! number of atoms in cell array
+ int cellSize = -1;
+ //! number of atoms allocated in cell array
+ int cellSizeAlloc = -1;
+};
+
+class GpuForceReduction::Impl
+{
+
+public:
+ /*! \brief Creates GPU force reduction object
+ *
+ * \param [in] deviceStream Stream to use for reduction
+ * \param [in] deviceContext GPU device context
+ */
+ Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ ~Impl();
+
+ /*! \brief Register a nbnxm-format force to be reduced
+ *
+ * \param [in] forcePtr Pointer to force to be reduced
+ */
+ void registerNbnxmForce(DeviceBuffer<RVec> forcePtr);
+
+ /*! \brief Register a rvec-format force to be reduced
+ *
+ * \param [in] forcePtr Pointer to force to be reduced
+ */
+ void registerRvecForce(DeviceBuffer<RVec> forcePtr);
+
+ /*! \brief Add a dependency for this force reduction
+ *
+ * \param [in] dependency Dependency for this reduction
+ */
+ void addDependency(GpuEventSynchronizer* const dependency);
+
+ /*! \brief Reinitialize the GPU force reduction
+ *
+ * \param [in] baseForcePtr Pointer to force to be used as a base
+ * \param [in] numAtoms The number of atoms
+ * \param [in] cell Pointer to the cell array
+ * \param [in] atomStart The start atom for the reduction
+ * \param [in] accumulate Whether reduction should be accumulated
+ * \param [in] completionMarker Event to be marked when launch of reduction is complete
+ */
+ void reinit(float3* baseForcePtr,
+ const int numAtoms,
+ ArrayRef<const int> cell,
+ const int atomStart,
+ const bool accumulate,
+ GpuEventSynchronizer* completionMarker = nullptr);
+
+ /*! \brief Execute the force reduction */
+ void execute();
+
+private:
+ //! force to be used as a base for this reduction
+ float3* baseForce_ = nullptr;
+ //! starting atom
+ int atomStart_ = 0;
+ //! number of atoms
+ int numAtoms_ = 0;
+ //! whether reduction is accumulated into base force buffer
+ int accumulate_ = true;
+ //! cell information for any nbat-format forces
+ struct cellInfo cellInfo_;
+ //! GPU context object
+ const DeviceContext& deviceContext_;
+ //! list of dependencies
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList_;
+ //! stream to be used for this reduction
+ const DeviceStream& deviceStream_;
+ //! Nbnxm force to be added in this reduction
+ DeviceBuffer<RVec> nbnxmForceToAdd_ = nullptr;
+ //! Rvec-format force to be added in this reduction
+ DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
+ //! event to be marked when redcution launch has been completed
+ GpuEventSynchronizer* completionMarker_ = nullptr;
+};
+
+} // namespace gmx
+
+#endif
#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/sysinfo.h"
+#include "gpuforcereduction.h"
+
using gmx::ArrayRef;
using gmx::AtomLocality;
using gmx::DomainLifetimeWorkload;
}
}
+
+/*! \brief Setup for the local and non-local GPU force reductions:
+ * reinitialization plus the registration of forces and dependencies.
+ *
+ * \param [in] runScheduleWork Schedule workload flag structure
+ * \param [in] cr Communication record object
+ * \param [in] fr Force record object
+ * \param [in] ddUsesGpuDirectCommunication Whether GPU direct communication is in use
+ */
+static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
+ const t_commrec* cr,
+ t_forcerec* fr,
+ bool ddUsesGpuDirectCommunication)
+{
+
+ nonbonded_verlet_t* nbv = fr->nbv.get();
+ gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
+
+ // (re-)initialize local GPU force reduction
+ const bool accumulate =
+ runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
+ const int atomStart = 0;
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
+ stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::Local), nbv->getGridIndices(),
+ atomStart, accumulate, stateGpu->fReducedOnDevice());
+
+ // register forces and add dependencies
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(nbv->getGpuForces());
+
+ if (runScheduleWork->simulationWork.useGpuPme
+ && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
+ {
+ void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
+ : // PME force buffer on same GPU
+ fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
+
+ GpuEventSynchronizer* const pmeSynchronizer =
+ (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
+ : // PME force buffer on same GPU
+ fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
+ }
+
+ if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr))
+ && !ddUsesGpuDirectCommunication)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
+ stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, true));
+ }
+
+ if (ddUsesGpuDirectCommunication)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
+ cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
+ }
+
+ if (havePPDomainDecomposition(cr))
+ {
+ // (re-)initialize non-local GPU force reduction
+ const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
+ || runScheduleWork->domainWork.haveFreeEnergyWork;
+ const int atomStart = dd_numHomeAtoms(*cr->dd);
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
+ stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::NonLocal),
+ nbv->getGridIndices(), atomStart, accumulate);
+
+ // register forces and add dependencies
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(nbv->getGpuForces());
+ if (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
+ stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::NonLocal, true));
+ }
+ }
+}
+
+
void do_force(FILE* fplog,
const t_commrec* cr,
const gmx_multisim_t* ms,
launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, lambda[efptCOUL], wcycle);
}
+ const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
+
/* do gridding for pair search */
if (stepWork.doNeighborSearch)
{
{
nbv->atomdata_init_copy_x_to_nbat_x_gpu();
}
- // For force buffer ops, we use the below conditon rather than
- // useGpuFBufferOps to ensure that init is performed even if this
- // NS step is also a virial step (on which f buf ops are deactivated).
- if (GMX_GPU_CUDA && simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded)
+
+ if (simulationWork.useGpuBufferOps)
{
- GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
- nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
+ setupGpuForceReductions(runScheduleWork, cr, fr, ddUsesGpuDirectCommunication);
}
}
else if (!EI_TPI(inputrec->eI))
}
}
- const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
-
if (simulationWork.useGpuNonbonded)
{
ddBalanceRegionHandler.openBeforeForceComputationGpu();
if (stepWork.useGpuFBufferOps)
{
- gmx::FixedCapacityVector<GpuEventSynchronizer*, 1> dependencyList;
-
// TODO: move this into DomainLifetimeWorkload, including the second part of the
// condition The bonded and free energy CPU tasks can have non-local force
// contributions which are a dependency for the GPU force reduction.
{
stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(),
AtomLocality::NonLocal);
- dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
- AtomLocality::NonLocal, stepWork.useGpuFBufferOps));
}
- nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::NonLocal, stateGpu->getForces(),
- pme_gpu_get_device_f(fr->pmedata), dependencyList,
- false, haveNonLocalForceContribInCpuBuffer);
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
+
if (!useGpuForcesHaloExchange)
{
// copy from GPU input for dd_move_f()
* on the non-alternating path. */
if (useOrEmulateGpuNb && !alternateGpuWait)
{
- // TODO simplify the below conditionals. Pass buffer and sync pointers at init stage rather than here. Unify getter fns for sameGPU/otherGPU cases.
- void* pmeForcePtr =
- stepWork.useGpuPmeFReduction
- ? (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
- : // PME force buffer on same GPU
- fr->pmePpCommGpu->getGpuForceStagingPtr()) // buffer received from other GPU
- : nullptr; // PME reduction not active on GPU
-
- GpuEventSynchronizer* const pmeSynchronizer =
- stepWork.useGpuPmeFReduction
- ? (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
- : // PME force buffer on same GPU
- static_cast<GpuEventSynchronizer*>(
- fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
- : nullptr; // PME reduction not active on GPU
-
- gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;
-
- if (stepWork.useGpuPmeFReduction)
- {
- dependencyList.push_back(pmeSynchronizer);
- }
-
gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
if (stepWork.useGpuFBufferOps)
auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
stateGpu->copyForcesToGpu(forceWithShift, locality);
- dependencyList.push_back(
- stateGpu->getForcesReadyOnDeviceEvent(locality, stepWork.useGpuFBufferOps));
- }
- if (useGpuForcesHaloExchange)
- {
- dependencyList.push_back(cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
}
- nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
- dependencyList, stepWork.useGpuPmeFReduction,
- haveLocalForceContribInCpuBuffer);
+
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
+
// Copy forces to host if they are needed for update or if virtual sites are enabled.
// If there are vsites, we need to copy forces every step to spread vsite forces on host.
// TODO: When the output flags will be included in step workload, this copy can be combined with the
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/gpuforcereduction.h"
#include "gromacs/mdlib/makeconstraints.h"
#include "gromacs/mdlib/md_support.h"
#include "gromacs/mdlib/mdatoms.h"
domdecOptions.checkBondedInteractions, fr->cginfo_mb);
}
+ if (runScheduleWork.simulationWork.useGpuBufferOps)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
+ deviceStreamManager->context(),
+ deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal));
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
+ deviceStreamManager->context(),
+ deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal));
+ }
+
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
if (gpusWereDetected
&& ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
#include "gromacs/utility/basedefinitions.h"
#include "gromacs/utility/real.h"
+#include "locality.h"
+
/* Abstract type for PME that is defined only in the routine that use them. */
struct gmx_pme_t;
struct nonbonded_verlet_t;
{
class DeviceStreamManager;
class GpuBonded;
+class GpuForceReduction;
class ForceProviders;
class StatePropagatorDataGpu;
class PmePpCommGpu;
/* For PME-PP GPU communication */
std::unique_ptr<gmx::PmePpCommGpu> pmePpCommGpu;
+
+ /* For GPU force reduction (on both local and non-local atoms) */
+ gmx::EnumerationArray<gmx::AtomLocality, std::unique_ptr<gmx::GpuForceReduction>> gpuForceReduction;
};
/* Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have
}
}
-/* Add the force array(s) from nbnxn_atomdata_t to f */
-void reduceForcesGpu(const gmx::AtomLocality locality,
- DeviceBuffer<RVec> totalForcesDevice,
- const Nbnxm::GridSet& gridSet,
- void* pmeForcesDevice,
- gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
- NbnxmGpu* gpu_nbv,
- bool useGpuFPmeReduction,
- bool accumulateForce)
-{
- int atomsStart = 0;
- int numAtoms = 0;
-
- nbnxn_get_atom_range(locality, gridSet, &atomsStart, &numAtoms);
-
- if (numAtoms == 0)
- {
- /* The are no atoms for this reduction, avoid some overhead */
- return;
- }
-
- Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality, totalForcesDevice, gpu_nbv, pmeForcesDevice, dependencyList,
- atomsStart, numAtoms, useGpuFPmeReduction, accumulateForce);
-}
-
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t& nbat, gmx::ArrayRef<gmx::RVec> fshift)
{
gmx::ArrayRef<const nbnxn_atomdata_output_t> outputBuffers = nbat.out;
*/
void reduceForces(nbnxn_atomdata_t* nbat, gmx::AtomLocality locality, const Nbnxm::GridSet& gridSet, rvec* totalForce);
-/*! \brief Reduce forces on the GPU
- *
- * \param[in] locality If the reduction should be performed on local or non-local atoms.
- * \param[out] totalForcesDevice Device buffer to accumulate resulting force.
- * \param[in] gridSet The grids data.
- * \param[in] pmeForcesDevice Device buffer with PME forces.
- * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on.
- * \param[in] gpu_nbv The NBNXM GPU data structure.
- * \param[in] useGpuFPmeReduction Whether PME forces should be added.
- * \param[in] accumulateForce Whether there are usefull data already in the total force buffer.
- */
-void reduceForcesGpu(gmx::AtomLocality locality,
- DeviceBuffer<gmx::RVec> totalForcesDevice,
- const Nbnxm::GridSet& gridSet,
- void* pmeForcesDevice,
- gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
- NbnxmGpu* gpu_nbv,
- bool useGpuFPmeReduction,
- bool accumulateForce);
-
//! Add the fshift force stored in nbat to fshift
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t& nbat, gmx::ArrayRef<gmx::RVec> fshift);
nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
-/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format.
- *
- * NOTE: When the total force device buffer is reallocated and its size increases, it is cleared in
- * Local stream. Hence, if accumulateForce is true, NonLocal stream should start accumulating
- * forces only after Local stream already done so.
- */
-void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
- DeviceBuffer<gmx::RVec> totalForcesDevice,
- NbnxmGpu* nb,
- void* pmeForcesDevice,
- gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
- int atomStart,
- int numAtoms,
- bool useGpuFPmeReduction,
- bool accumulateForce)
+void* getGpuForces(NbnxmGpu* nb)
{
- GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms");
- GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer");
-
- const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
- const DeviceStream& deviceStream = *nb->deviceStreams[iLocality];
- cu_atomdata_t* adat = nb->atdat;
-
- size_t gmx_used_in_debug numDependency = static_cast<size_t>((useGpuFPmeReduction == true))
- + static_cast<size_t>((accumulateForce == true));
- GMX_ASSERT(numDependency >= dependencyList.size(),
- "Mismatching number of dependencies and call signature");
-
- // Enqueue wait on all dependencies passed
- for (auto const synchronizer : dependencyList)
- {
- synchronizer->enqueueWaitEvent(deviceStream);
- }
-
- /* launch kernel */
-
- KernelLaunchConfig config;
- config.blockSize[0] = c_bufOpsThreadsPerBlock;
- config.blockSize[1] = 1;
- config.blockSize[2] = 1;
- config.gridSize[0] = ((numAtoms + 1) + c_bufOpsThreadsPerBlock - 1) / c_bufOpsThreadsPerBlock;
- config.gridSize[1] = 1;
- config.gridSize[2] = 1;
- config.sharedMemorySize = 0;
-
- auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel<true, false>
- : nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;
-
- if (useGpuFPmeReduction)
- {
- GMX_ASSERT(pmeForcesDevice, "Need a valid pmeForcesDevice pointer");
- kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel<true, true>
- : nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>;
- }
-
- const float3* d_fNB = adat->f;
- const float3* d_fPme = static_cast<float3*>(pmeForcesDevice);
- float3* d_fTotal = asFloat3(totalForcesDevice);
- const int* d_cell = nb->cell;
-
- const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal,
- &d_cell, &atomStart, &numAtoms);
-
- launchGpuKernel(kernelFn, config, deviceStream, nullptr, "FbufferOps", kernelArgs);
-
- if (atomLocality == AtomLocality::Local)
- {
- GMX_ASSERT(nb->localFReductionDone != nullptr,
- "localFReductionDone has to be a valid pointer");
- nb->localFReductionDone->markEvent(deviceStream);
- }
+ return nb->atdat->f;
}
} // namespace Nbnxm
nb->ncxy_na_alloc = 0;
nb->ncxy_ind = 0;
nb->ncxy_ind_alloc = 0;
- nb->ncell = 0;
- nb->ncell_alloc = 0;
if (debug)
{
return;
}
-/* Initialization for F buffer operations on GPU. */
-void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell,
- NbnxmGpu* gpu_nbv,
- int natoms_total,
- GpuEventSynchronizer* const localReductionDone)
-{
-
- const DeviceStream& deviceStream = *gpu_nbv->deviceStreams[InteractionLocality::Local];
-
- GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
- gpu_nbv->localFReductionDone = localReductionDone;
-
- if (natoms_total > 0)
- {
- reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc,
- *gpu_nbv->deviceContext_);
- copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream,
- GpuApiCallBehavior::Async, nullptr);
- }
-
- return;
-}
-
} // namespace Nbnxm
bool bUseTwoStreams = false;
/*! \brief atom data */
cu_atomdata_t* atdat = nullptr;
- /*! \brief f buf ops cell index mapping */
- int* cell = nullptr;
- /*! \brief number of indices in cell buffer */
- int ncell = 0;
- /*! \brief number of indices allocated in cell buffer */
- int ncell_alloc = 0;
/*! \brief array of atom indices */
int* atomIndices = nullptr;
/*! \brief size of atom indices */
* will be true. */
gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork = { { false } };
- /*! \brief Pointer to event synchronizer triggered when the local
- * GPU buffer ops / reduction is complete
- *
- * \note That the synchronizer is managed outside of this module
- * in StatePropagatorDataGpu.
- */
- GpuEventSynchronizer* localFReductionDone = nullptr;
-
/*! \brief Event triggered when non-local coordinate buffer
* has been copied from device to host. */
GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr;
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
-void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality locality,
- DeviceBuffer<gmx::RVec> totalForcesDevice,
- void* forcesPmeDevice,
- gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
- bool useGpuFPmeReduction,
- bool accumulateForce)
+int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality)
{
-
- GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)),
- "GPU PME force reduction is only valid when a non-null GPU PME force pointer is "
- "available");
-
- /* Skip the reduction if there was no short-range GPU work to do
- * (either NB or both NB and bonded work). */
- if (!pairlistIsSimple() && !Nbnxm::haveGpuShortRangeWork(gpu_nbv, locality))
+ int numAtoms = 0;
+ switch (locality)
{
- return;
+ case gmx::AtomLocality::All: numAtoms = pairSearch_->gridSet().numRealAtomsTotal(); break;
+ case gmx::AtomLocality::Local: numAtoms = pairSearch_->gridSet().numRealAtomsLocal(); break;
+ case gmx::AtomLocality::NonLocal:
+ numAtoms = pairSearch_->gridSet().numRealAtomsTotal()
+ - pairSearch_->gridSet().numRealAtomsLocal();
+ break;
+ case gmx::AtomLocality::Count:
+ GMX_ASSERT(false, "Count is invalid locality specifier");
+ break;
}
-
- wallcycle_start(wcycle_, ewcLAUNCH_GPU);
- wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
-
- reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice,
- dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce);
-
- wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
- wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+ return numAtoms;
}
-void nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone)
+void* nonbonded_verlet_t::getGpuForces()
{
-
- wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
-
- const Nbnxm::GridSet& gridSet = pairSearch_->gridSet();
-
- Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(), gpu_nbv,
- gridSet.numRealAtomsTotal(), localReductionDone);
-
- wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
- wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+ return Nbnxm::getGpuForces(gpu_nbv);
}
real nonbonded_verlet_t::pairlistInnerRadius() const
bool useGpuFPmeReduction,
bool accumulateForce);
- /*! \brief Outer body of function to perform initialization for F buffer operations on GPU.
+ /*! \brief Get the number of atoms for a given locality
*
- * \param localReductionDone Pointer to an event synchronizer that marks the completion of the local f buffer ops kernel.
+ * \param [in] locality Local or non-local
+ * \returns The number of atoms for given locality
*/
- void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone);
+ int getNumAtoms(gmx::AtomLocality locality);
- /*! \brief return GPU pointer to f in rvec format */
- void* get_gpu_frvec();
+ /*! \brief Get the pointer to the GPU nonbonded force buffer
+ *
+ * \returns A pointer to the force buffer in GPU memory
+ */
+ void* getGpuForces();
//! Return the kernel setup
const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; }
bool haveGpuShortRangeWork(const NbnxmGpu gmx_unused* nb, gmx::AtomLocality gmx_unused aLocality)
GPU_FUNC_TERM_WITH_RETURN(false);
-/*! \brief Initialization for F buffer operations on GPU */
-CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell,
- NbnxmGpu gmx_unused* gpu_nbv,
- int gmx_unused natoms_total,
- GpuEventSynchronizer gmx_unused* localReductionDone) CUDA_FUNC_TERM;
-
-/*! \brief Force buffer operations on GPU.
- *
- * Transforms non-bonded forces into plain rvec format and add all the force components to the total
- * force buffer
- *
- * \param[in] atomLocality If the reduction should be performed on local or non-local atoms.
- * \param[in] totalForcesDevice Device buffer to accumulate resulting force.
- * \param[in] gpu_nbv The NBNXM GPU data structure.
- * \param[in] pmeForcesDevice Device buffer with PME forces.
- * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on.
- * \param[in] atomStart Index of the first atom to reduce forces for.
- * \param[in] numAtoms Number of atoms to reduce forces for.
- * \param[in] useGpuFPmeReduction Whether PME forces should be added.
- * \param[in] accumulateForce Whether there are usefull data already in the total force buffer.
- *
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality,
- DeviceBuffer<gmx::RVec> gmx_unused totalForcesDevice,
- NbnxmGpu gmx_unused* gpu_nbv,
- void gmx_unused* pmeForcesDevice,
- gmx::ArrayRef<GpuEventSynchronizer* const> gmx_unused dependencyList,
- int gmx_unused atomStart,
- int gmx_unused numAtoms,
- bool gmx_unused useGpuFPmeReduction,
- bool gmx_unused accumulateForce) CUDA_FUNC_TERM;
-
/*! \brief sync CPU thread on coordinate copy to device
* \param[in] nb The nonbonded data GPU structure
*/
CUDA_FUNC_QUALIFIER
void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM;
+/*! \brief Get the pointer to the GPU nonbonded force buffer
+ *
+ * \param[in] nb The nonbonded data GPU structure
+ * \returns A pointer to the force buffer in GPU memory
+ */
+CUDA_FUNC_QUALIFIER
+void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
+
} // namespace Nbnxm
#endif