From 257d8094c09479fe3f7bd9386013056453f26455 Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Tue, 29 Sep 2020 11:27:18 +0000
Subject: [PATCH] Redevelopment of GPU Force Reduction/Buffer Ops

Introduces a new purpose-built class for GPU force reduction, which
replaces the previous force buffer ops mechanism.

Refs #3370
---
 src/gromacs/ewald/pme_pp_comm_gpu.h          |   2 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp   |   2 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cu    |   6 +-
 src/gromacs/ewald/pme_pp_comm_gpu_impl.h     |   2 +-
 src/gromacs/mdlib/CMakeLists.txt             |   1 +
 src/gromacs/mdlib/forcerec.cpp               |   2 +
 src/gromacs/mdlib/gpuforcereduction.h        | 122 +++++++++
 src/gromacs/mdlib/gpuforcereduction_impl.cpp | 105 ++++++++
 src/gromacs/mdlib/gpuforcereduction_impl.cu  | 234 ++++++++++++++++++
 src/gromacs/mdlib/gpuforcereduction_impl.cuh | 144 +++++++++++
 src/gromacs/mdlib/sim_util.cpp               | 137 ++++++----
 src/gromacs/mdrun/runner.cpp                 |  11 +
 src/gromacs/mdtypes/forcerec.h               |   6 +
 src/gromacs/nbnxm/atomdata.cpp               |  25 --
 src/gromacs/nbnxm/atomdata.h                 |  20 --
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu         |  73 +-----
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu       |  25 --
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h    |  14 --
 src/gromacs/nbnxm/nbnxm.cpp                  |  51 ++--
 src/gromacs/nbnxm/nbnxm_gpu.h                |  42 +---
 src/gromacs/nbnxm/nbnxm.h                    |  14 +-
 21 files changed, 755 insertions(+), 283 deletions(-)
 create mode 100644 src/gromacs/mdlib/gpuforcereduction.h
 create mode 100644 src/gromacs/mdlib/gpuforcereduction_impl.cpp
 create mode 100644 src/gromacs/mdlib/gpuforcereduction_impl.cu
 create mode 100644 src/gromacs/mdlib/gpuforcereduction_impl.cuh

diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h
index 97accca871..a3e0239e58 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu.h
@@ -103,7 +103,7 @@ public: /*!
\brief * Return pointer to event recorded when forces are ready */ - void* getForcesReadySynchronizer(); + GpuEventSynchronizer* getForcesReadySynchronizer(); private: class Impl; diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp index 0259cd0229..d19004e7ed 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp @@ -110,7 +110,7 @@ void* PmePpCommGpu::getGpuForceStagingPtr() return nullptr; } -void* PmePpCommGpu::getForcesReadySynchronizer() +GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer() { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication was called instead of the correct " diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index 2c6f696ddd..acb5998b02 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -154,9 +154,9 @@ void* PmePpCommGpu::Impl::getGpuForceStagingPtr() return static_cast(d_pmeForces_); } -void* PmePpCommGpu::Impl::getForcesReadySynchronizer() +GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer() { - return static_cast(&forcesReadySynchronizer_); + return &forcesReadySynchronizer_; } PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, @@ -193,7 +193,7 @@ void* PmePpCommGpu::getGpuForceStagingPtr() return impl_->getGpuForceStagingPtr(); } -void* PmePpCommGpu::getForcesReadySynchronizer() +GpuEventSynchronizer* PmePpCommGpu::getForcesReadySynchronizer() { return impl_->getForcesReadySynchronizer(); } diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h index 4c95d9bccd..0630084e59 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h @@ -115,7 +115,7 @@ public: /*! \brief * Return pointer to event recorded when forces are ready */ - void* getForcesReadySynchronizer(); + GpuEventSynchronizer* getForcesReadySynchronizer(); private: //! GPU context handle (not used in CUDA) diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index c279548147..6cb02ee517 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -45,6 +45,7 @@ if(GMX_GPU_CUDA) lincs_gpu.cu settle_gpu.cu update_constrain_gpu_impl.cu + gpuforcereduction_impl.cu ) endif() diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index 34c10d3001..6144b3a2fa 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -100,6 +100,8 @@ #include "gromacs/utility/smalloc.h" #include "gromacs/utility/strconvert.h" +#include "gpuforcereduction.h" + ForceHelperBuffers::ForceHelperBuffers(bool haveDirectVirialContributions) : haveDirectVirialContributions_(haveDirectVirialContributions) { diff --git a/src/gromacs/mdlib/gpuforcereduction.h b/src/gromacs/mdlib/gpuforcereduction.h new file mode 100644 index 0000000000..de8aebbf76 --- /dev/null +++ b/src/gromacs/mdlib/gpuforcereduction.h @@ -0,0 +1,122 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. 
+ * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declares the GPU Force Reduction + * + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#ifndef GMX_MDLIB_GPUFORCEREDUCTION_H +#define GMX_MDLIB_GPUFORCEREDUCTION_H + +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/math/vectypes.h" +#include "gromacs/utility/arrayref.h" +#include "gromacs/utility/fixedcapacityvector.h" + +class GpuEventSynchronizer; + +namespace gmx +{ + +/*! \libinternal + * \brief Manages the force reduction directly in GPU memory + * + * Manages the reduction of multiple GPU force buffers into a single + * GPU force buffer. The reduction involves at least one (input/output) + * Rvec-format buffer and one (input) Nbat-format buffer, where the + * Nbat->Rvec conversion is handled internally. One additional (input) + * Rvec-format buffer is supported as optional. + */ +class GpuForceReduction +{ + +public: + /*! \brief Creates GPU force reduction object + * + * \param [in] deviceContext GPU device context + * \param [in] deviceStream Stream to use for reduction + */ + GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream); + ~GpuForceReduction(); + + /*! \brief Register a nbnxm-format force to be reduced + * + * \param [in] forcePtr Pointer to force to be reduced + */ + void registerNbnxmForce(void* forcePtr); + + /*! \brief Register a rvec-format force to be reduced + * + * \param [in] forcePtr Pointer to force to be reduced + */ + void registerRvecForce(void* forcePtr); + + /*! \brief Add a dependency for this force reduction + * + * \param [in] dependency Dependency for this reduction + */ + void addDependency(GpuEventSynchronizer* dependency); + + /*! 
\brief Reinitialize the GPU force reduction + * + * \param [in] baseForcePtr Pointer to force to be used as a base + * \param [in] numAtoms The number of atoms + * \param [in] cell Pointer to the cell array + * \param [in] atomStart The start atom for the reduction + * \param [in] accumulate Whether reduction should be accumulated + * \param [in] completionMarker Event to be marked when launch of reduction is complete + */ + void reinit(DeviceBuffer baseForcePtr, + int numAtoms, + ArrayRef cell, + int atomStart, + bool accumulate, + GpuEventSynchronizer* completionMarker = nullptr); + + /*! \brief Execute the force reduction */ + void execute(); + +private: + class Impl; + gmx::PrivateImplPointer impl_; +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp new file mode 100644 index 0000000000..ff32deef96 --- /dev/null +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp @@ -0,0 +1,105 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief May be used to implement force reduction interfaces for non-GPU builds. 
+ * + * \author Alan Gray + * + * \ingroup module_mdlib + */ + +#include "gmxpre.h" + +#include "config.h" + +#include "gpuforcereduction.h" + +#if !GMX_GPU_CUDA + +namespace gmx +{ + +class GpuForceReduction::Impl +{ +}; + +GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */, + const DeviceStream& /* deviceStream */) : + impl_(nullptr) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::reinit(DeviceBuffer /*baseForcePtr*/, + const int /*numAtoms*/, + ArrayRef /*cell*/, + const int /*atomStart*/, + const bool /*accumulate*/, + GpuEventSynchronizer* /*completionMarker*/) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::registerNbnxmForce(void* /* forcePtr */) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::registerRvecForce(void* /* forcePtr */) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::addDependency(GpuEventSynchronizer* const /* dependency */) +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +// NOLINTNEXTLINE readability-convert-member-functions-to-static +void GpuForceReduction::execute() +{ + GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation."); +} + +GpuForceReduction::~GpuForceReduction() = default; + +} // namespace gmx + +#endif diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu new file mode 100644 index 0000000000..29c18daf33 --- /dev/null +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu @@ -0,0 +1,234 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. 
Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements GPU Force Reduction using CUDA + * + * \author Alan Gray + * + * \ingroup module_mdlib + */ + +#include "gmxpre.h" + +#include "gpuforcereduction_impl.cuh" + +#include + +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/device_context.h" +#include "gromacs/gpu_utils/devicebuffer.h" +#include "gromacs/gpu_utils/gpu_utils.h" +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" +#include "gromacs/gpu_utils/typecasts.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" +#include "gromacs/utility/gmxassert.h" + +#include "gpuforcereduction.h" + +namespace gmx +{ + +constexpr static int c_threadsPerBlock = 128; + +typedef struct rvecDeviceForceData rvecDeviceForceData_t; + + +template +static __global__ void reduceKernel(const float3* __restrict__ gm_nbnxmForce, + const float3* __restrict__ rvecForceToAdd, + float3* gm_fTotal, + const int* gm_cell, + const int numAtoms) +{ + + // map particle-level parallelism to 1D CUDA thread and block index + const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; + + // perform addition for each particle + if (threadIndex < numAtoms) + { + + float3* gm_fDest = &gm_fTotal[threadIndex]; + float3 temp; + + // Accumulate or set nbnxm force + if (accumulateForce) + { + temp = *gm_fDest; + temp += gm_nbnxmForce[gm_cell[threadIndex]]; + } + else + { + temp = gm_nbnxmForce[gm_cell[threadIndex]]; + } + + if (addRvecForce) + { + temp += rvecForceToAdd[threadIndex]; + } + + *gm_fDest = temp; + } + return; +} + +GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream) : + deviceContext_(deviceContext), + deviceStream_(deviceStream){}; + +void GpuForceReduction::Impl::reinit(float3* baseForcePtr, + const int numAtoms, + ArrayRef cell, + const int atomStart, + const bool accumulate, + GpuEventSynchronizer* completionMarker) +{ + GMX_ASSERT((baseForcePtr != nullptr), "Input base force for reduction has no data"); + baseForce_ = &(baseForcePtr[atomStart]); + numAtoms_ = numAtoms; + atomStart_ = atomStart; + accumulate_ = accumulate; + completionMarker_ = completionMarker; + cellInfo_.cell = cell.data(); + reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize, + &cellInfo_.cellSizeAlloc, deviceContext_); + copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_, + GpuApiCallBehavior::Async, nullptr); + + dependencyList_.clear(); +}; + +void GpuForceReduction::Impl::registerNbnxmForce(DeviceBuffer forcePtr) +{ + GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data"); + nbnxmForceToAdd_ = forcePtr; +}; + +void GpuForceReduction::Impl::registerRvecForce(DeviceBuffer forcePtr) +{ + GMX_ASSERT((forcePtr != nullptr), "Input force for reduction has no data"); + rvecForceToAdd_ = forcePtr; +}; + +void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependency) +{ + dependencyList_.push_back(dependency); +} + +void GpuForceReduction::Impl::execute() +{ + + if (numAtoms_ == 0) + { + return; + } + + GMX_ASSERT((nbnxmForceToAdd_ != nullptr), "Nbnxm force for reduction has no data"); + + // Enqueue wait on all dependencies passed + for (auto const synchronizer : dependencyList_) + { + 
synchronizer->enqueueWaitEvent(deviceStream_); + } + + float3* d_nbnxmForce = asFloat3(nbnxmForceToAdd_); + float3* d_rvecForceToAdd = &(asFloat3(rvecForceToAdd_)[atomStart_]); + + // Configure and launch kernel + KernelLaunchConfig config; + config.blockSize[0] = c_threadsPerBlock; + config.blockSize[1] = 1; + config.blockSize[2] = 1; + config.gridSize[0] = ((numAtoms_ + 1) + c_threadsPerBlock - 1) / c_threadsPerBlock; + config.gridSize[1] = 1; + config.gridSize[2] = 1; + config.sharedMemorySize = 0; + + auto kernelFn = (rvecForceToAdd_ != nullptr) + ? (accumulate_ ? reduceKernel : reduceKernel) + : (accumulate_ ? reduceKernel : reduceKernel); + + const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_nbnxmForce, &d_rvecForceToAdd, + &baseForce_, &cellInfo_.d_cell, &numAtoms_); + + launchGpuKernel(kernelFn, config, deviceStream_, nullptr, "Force Reduction", kernelArgs); + + // Mark that kernel has been launched + if (completionMarker_ != nullptr) + { + completionMarker_->markEvent(deviceStream_); + } +} + +GpuForceReduction::Impl::~Impl(){}; + +GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream) : + impl_(new Impl(deviceContext, deviceStream)) +{ +} + +void GpuForceReduction::registerNbnxmForce(void* forcePtr) +{ + impl_->registerNbnxmForce(reinterpret_cast>(forcePtr)); +} + +void GpuForceReduction::registerRvecForce(void* forcePtr) +{ + impl_->registerRvecForce(reinterpret_cast>(forcePtr)); +} + +void GpuForceReduction::addDependency(GpuEventSynchronizer* const dependency) +{ + impl_->addDependency(dependency); +} + +void GpuForceReduction::reinit(DeviceBuffer baseForcePtr, + const int numAtoms, + ArrayRef cell, + const int atomStart, + const bool accumulate, + GpuEventSynchronizer* completionMarker) +{ + impl_->reinit(asFloat3(baseForcePtr), numAtoms, cell, atomStart, accumulate, completionMarker); +} +void GpuForceReduction::execute() +{ + impl_->execute(); +} + +GpuForceReduction::~GpuForceReduction() = default; + +} // namespace gmx diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cuh b/src/gromacs/mdlib/gpuforcereduction_impl.cuh new file mode 100644 index 0000000000..536e3fd33b --- /dev/null +++ b/src/gromacs/mdlib/gpuforcereduction_impl.cuh @@ -0,0 +1,144 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2020, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. 
Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declares the GPU Force Reduction + * + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#ifndef GMX_MDLIB_GPUFORCEREDUCTION_IMPL_H +#define GMX_MDLIB_GPUFORCEREDUCTION_IMPL_H + +#include "gromacs/gpu_utils/device_stream.h" +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/math/vectypes.h" + +#include "gpuforcereduction.h" + +namespace gmx +{ + +//! structure to hold cell information for any nbat-format forces +struct cellInfo +{ + //! cell index mapping for any nbat-format forces + const int* cell = nullptr; + //! device copy of cell index mapping for any nbat-format forces + int* d_cell = nullptr; + //! number of atoms in cell array + int cellSize = -1; + //! number of atoms allocated in cell array + int cellSizeAlloc = -1; +}; + +class GpuForceReduction::Impl +{ + +public: + /*! \brief Creates GPU force reduction object + * + * \param [in] deviceStream Stream to use for reduction + * \param [in] deviceContext GPU device context + */ + Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream); + ~Impl(); + + /*! \brief Register a nbnxm-format force to be reduced + * + * \param [in] forcePtr Pointer to force to be reduced + */ + void registerNbnxmForce(DeviceBuffer forcePtr); + + /*! \brief Register a rvec-format force to be reduced + * + * \param [in] forcePtr Pointer to force to be reduced + */ + void registerRvecForce(DeviceBuffer forcePtr); + + /*! \brief Add a dependency for this force reduction + * + * \param [in] dependency Dependency for this reduction + */ + void addDependency(GpuEventSynchronizer* const dependency); + + /*! \brief Reinitialize the GPU force reduction + * + * \param [in] baseForcePtr Pointer to force to be used as a base + * \param [in] numAtoms The number of atoms + * \param [in] cell Pointer to the cell array + * \param [in] atomStart The start atom for the reduction + * \param [in] accumulate Whether reduction should be accumulated + * \param [in] completionMarker Event to be marked when launch of reduction is complete + */ + void reinit(float3* baseForcePtr, + const int numAtoms, + ArrayRef cell, + const int atomStart, + const bool accumulate, + GpuEventSynchronizer* completionMarker = nullptr); + + /*! \brief Execute the force reduction */ + void execute(); + +private: + //! force to be used as a base for this reduction + float3* baseForce_ = nullptr; + //! starting atom + int atomStart_ = 0; + //! number of atoms + int numAtoms_ = 0; + //! whether reduction is accumulated into base force buffer + int accumulate_ = true; + //! cell information for any nbat-format forces + struct cellInfo cellInfo_; + //! GPU context object + const DeviceContext& deviceContext_; + //! list of dependencies + gmx::FixedCapacityVector dependencyList_; + //! stream to be used for this reduction + const DeviceStream& deviceStream_; + //! Nbnxm force to be added in this reduction + DeviceBuffer nbnxmForceToAdd_ = nullptr; + //! Rvec-format force to be added in this reduction + DeviceBuffer rvecForceToAdd_ = nullptr; + //! 
event to be marked when redcution launch has been completed + GpuEventSynchronizer* completionMarker_ = nullptr; +}; + +} // namespace gmx + +#endif diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index f8befd3ab7..eed67b4540 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -121,6 +121,8 @@ #include "gromacs/utility/strconvert.h" #include "gromacs/utility/sysinfo.h" +#include "gpuforcereduction.h" + using gmx::ArrayRef; using gmx::AtomLocality; using gmx::DomainLifetimeWorkload; @@ -1006,6 +1008,84 @@ static void reduceAndUpdateMuTot(DipoleData* dipoleData, } } + +/*! \brief Setup for the local and non-local GPU force reductions: + * reinitialization plus the registration of forces and dependencies. + * + * \param [in] runScheduleWork Schedule workload flag structure + * \param [in] cr Communication record object + * \param [in] fr Force record object + * \param [in] ddUsesGpuDirectCommunication Whether GPU direct communication is in use + */ +static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, + const t_commrec* cr, + t_forcerec* fr, + bool ddUsesGpuDirectCommunication) +{ + + nonbonded_verlet_t* nbv = fr->nbv.get(); + gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu; + + // (re-)initialize local GPU force reduction + const bool accumulate = + runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr); + const int atomStart = 0; + fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit( + stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::Local), nbv->getGridIndices(), + atomStart, accumulate, stateGpu->fReducedOnDevice()); + + // register forces and add dependencies + fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(nbv->getGpuForces()); + + if (runScheduleWork->simulationWork.useGpuPme + && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication)) + { + void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata) + : // PME force buffer on same GPU + fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU + fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr); + + GpuEventSynchronizer* const pmeSynchronizer = + (thisRankHasDuty(cr, DUTY_PME) ? 
pme_gpu_get_f_ready_synchronizer(fr->pmedata) + : // PME force buffer on same GPU + fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU + fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer); + } + + if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr)) + && !ddUsesGpuDirectCommunication) + { + fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency( + stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, true)); + } + + if (ddUsesGpuDirectCommunication) + { + fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency( + cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent()); + } + + if (havePPDomainDecomposition(cr)) + { + // (re-)initialize non-local GPU force reduction + const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork + || runScheduleWork->domainWork.haveFreeEnergyWork; + const int atomStart = dd_numHomeAtoms(*cr->dd); + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit( + stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::NonLocal), + nbv->getGridIndices(), atomStart, accumulate); + + // register forces and add dependencies + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(nbv->getGpuForces()); + if (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork) + { + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency( + stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::NonLocal, true)); + } + } +} + + void do_force(FILE* fplog, const t_commrec* cr, const gmx_multisim_t* ms, @@ -1191,6 +1271,8 @@ void do_force(FILE* fplog, launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, lambda[efptCOUL], wcycle); } + const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork; + /* do gridding for pair search */ if (stepWork.doNeighborSearch) { @@ -1276,13 +1358,10 @@ void do_force(FILE* fplog, { nbv->atomdata_init_copy_x_to_nbat_x_gpu(); } - // For force buffer ops, we use the below conditon rather than - // useGpuFBufferOps to ensure that init is performed even if this - // NS step is also a virial step (on which f buf ops are deactivated). - if (GMX_GPU_CUDA && simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded) + + if (simulationWork.useGpuBufferOps) { - GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded"); - nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice()); + setupGpuForceReductions(runScheduleWork, cr, fr, ddUsesGpuDirectCommunication); } } else if (!EI_TPI(inputrec->eI)) @@ -1306,8 +1385,6 @@ void do_force(FILE* fplog, } } - const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork; - if (simulationWork.useGpuNonbonded) { ddBalanceRegionHandler.openBeforeForceComputationGpu(); @@ -1686,8 +1763,6 @@ void do_force(FILE* fplog, if (stepWork.useGpuFBufferOps) { - gmx::FixedCapacityVector dependencyList; - // TODO: move this into DomainLifetimeWorkload, including the second part of the // condition The bonded and free energy CPU tasks can have non-local force // contributions which are a dependency for the GPU force reduction. 
@@ -1698,13 +1773,10 @@ void do_force(FILE* fplog, { stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::NonLocal); - dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent( - AtomLocality::NonLocal, stepWork.useGpuFBufferOps)); } - nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::NonLocal, stateGpu->getForces(), - pme_gpu_get_device_f(fr->pmedata), dependencyList, - false, haveNonLocalForceContribInCpuBuffer); + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute(); + if (!useGpuForcesHaloExchange) { // copy from GPU input for dd_move_f() @@ -1830,29 +1902,6 @@ void do_force(FILE* fplog, * on the non-alternating path. */ if (useOrEmulateGpuNb && !alternateGpuWait) { - // TODO simplify the below conditionals. Pass buffer and sync pointers at init stage rather than here. Unify getter fns for sameGPU/otherGPU cases. - void* pmeForcePtr = - stepWork.useGpuPmeFReduction - ? (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata) - : // PME force buffer on same GPU - fr->pmePpCommGpu->getGpuForceStagingPtr()) // buffer received from other GPU - : nullptr; // PME reduction not active on GPU - - GpuEventSynchronizer* const pmeSynchronizer = - stepWork.useGpuPmeFReduction - ? (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata) - : // PME force buffer on same GPU - static_cast( - fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU - : nullptr; // PME reduction not active on GPU - - gmx::FixedCapacityVector dependencyList; - - if (stepWork.useGpuPmeFReduction) - { - dependencyList.push_back(pmeSynchronizer); - } - gmx::ArrayRef forceWithShift = forceOut.forceWithShiftForces().force(); if (stepWork.useGpuFBufferOps) @@ -1881,16 +1930,10 @@ void do_force(FILE* fplog, auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All; stateGpu->copyForcesToGpu(forceWithShift, locality); - dependencyList.push_back( - stateGpu->getForcesReadyOnDeviceEvent(locality, stepWork.useGpuFBufferOps)); - } - if (useGpuForcesHaloExchange) - { - dependencyList.push_back(cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent()); } - nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr, - dependencyList, stepWork.useGpuPmeFReduction, - haveLocalForceContribInCpuBuffer); + + fr->gpuForceReduction[gmx::AtomLocality::Local]->execute(); + // Copy forces to host if they are needed for update or if virtual sites are enabled. // If there are vsites, we need to copy forces every step to spread vsite forces on host. 
// TODO: When the output flags will be included in step workload, this copy can be combined with the diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index e774622f2d..c88d8adbea 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -95,6 +95,7 @@ #include "gromacs/mdlib/force.h" #include "gromacs/mdlib/forcerec.h" #include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdlib/gpuforcereduction.h" #include "gromacs/mdlib/makeconstraints.h" #include "gromacs/mdlib/md_support.h" #include "gromacs/mdlib/mdatoms.h" @@ -1672,6 +1673,16 @@ int Mdrunner::mdrunner() domdecOptions.checkBondedInteractions, fr->cginfo_mb); } + if (runScheduleWork.simulationWork.useGpuBufferOps) + { + fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique( + deviceStreamManager->context(), + deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal)); + fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique( + deviceStreamManager->context(), + deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal)); + } + std::unique_ptr stateGpu; if (gpusWereDetected && ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME)) diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h index d5f084faed..4fa0a03f80 100644 --- a/src/gromacs/mdtypes/forcerec.h +++ b/src/gromacs/mdtypes/forcerec.h @@ -49,6 +49,8 @@ #include "gromacs/utility/basedefinitions.h" #include "gromacs/utility/real.h" +#include "locality.h" + /* Abstract type for PME that is defined only in the routine that use them. */ struct gmx_pme_t; struct nonbonded_verlet_t; @@ -64,6 +66,7 @@ namespace gmx { class DeviceStreamManager; class GpuBonded; +class GpuForceReduction; class ForceProviders; class StatePropagatorDataGpu; class PmePpCommGpu; @@ -325,6 +328,9 @@ struct t_forcerec /* For PME-PP GPU communication */ std::unique_ptr pmePpCommGpu; + + /* For GPU force reduction (on both local and non-local atoms) */ + gmx::EnumerationArray> gpuForceReduction; }; /* Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index 12003eb236..dd4eddd0ca 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1468,31 +1468,6 @@ void reduceForces(nbnxn_atomdata_t* nbat, const gmx::AtomLocality locality, cons } } -/* Add the force array(s) from nbnxn_atomdata_t to f */ -void reduceForcesGpu(const gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, - const Nbnxm::GridSet& gridSet, - void* pmeForcesDevice, - gmx::ArrayRef dependencyList, - NbnxmGpu* gpu_nbv, - bool useGpuFPmeReduction, - bool accumulateForce) -{ - int atomsStart = 0; - int numAtoms = 0; - - nbnxn_get_atom_range(locality, gridSet, &atomsStart, &numAtoms); - - if (numAtoms == 0) - { - /* The are no atoms for this reduction, avoid some overhead */ - return; - } - - Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality, totalForcesDevice, gpu_nbv, pmeForcesDevice, dependencyList, - atomsStart, numAtoms, useGpuFPmeReduction, accumulateForce); -} - void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t& nbat, gmx::ArrayRef fshift) { gmx::ArrayRef outputBuffers = nbat.out; diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index 007e2268ef..df3362b114 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -381,26 +381,6 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet& gridSet, */ 
void reduceForces(nbnxn_atomdata_t* nbat, gmx::AtomLocality locality, const Nbnxm::GridSet& gridSet, rvec* totalForce); -/*! \brief Reduce forces on the GPU - * - * \param[in] locality If the reduction should be performed on local or non-local atoms. - * \param[out] totalForcesDevice Device buffer to accumulate resulting force. - * \param[in] gridSet The grids data. - * \param[in] pmeForcesDevice Device buffer with PME forces. - * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on. - * \param[in] gpu_nbv The NBNXM GPU data structure. - * \param[in] useGpuFPmeReduction Whether PME forces should be added. - * \param[in] accumulateForce Whether there are usefull data already in the total force buffer. - */ -void reduceForcesGpu(gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, - const Nbnxm::GridSet& gridSet, - void* pmeForcesDevice, - gmx::ArrayRef dependencyList, - NbnxmGpu* gpu_nbv, - bool useGpuFPmeReduction, - bool accumulateForce); - //! Add the fshift force stored in nbat to fshift void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t& nbat, gmx::ArrayRef fshift); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 5f41fda5ef..48f02fde9e 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -894,78 +894,9 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid, nbnxnInsertNonlocalGpuDependency(nb, interactionLoc); } -/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. - * - * NOTE: When the total force device buffer is reallocated and its size increases, it is cleared in - * Local stream. Hence, if accumulateForce is true, NonLocal stream should start accumulating - * forces only after Local stream already done so. - */ -void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, - DeviceBuffer totalForcesDevice, - NbnxmGpu* nb, - void* pmeForcesDevice, - gmx::ArrayRef dependencyList, - int atomStart, - int numAtoms, - bool useGpuFPmeReduction, - bool accumulateForce) +void* getGpuForces(NbnxmGpu* nb) { - GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); - GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms"); - GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer"); - - const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); - const DeviceStream& deviceStream = *nb->deviceStreams[iLocality]; - cu_atomdata_t* adat = nb->atdat; - - size_t gmx_used_in_debug numDependency = static_cast((useGpuFPmeReduction == true)) - + static_cast((accumulateForce == true)); - GMX_ASSERT(numDependency >= dependencyList.size(), - "Mismatching number of dependencies and call signature"); - - // Enqueue wait on all dependencies passed - for (auto const synchronizer : dependencyList) - { - synchronizer->enqueueWaitEvent(deviceStream); - } - - /* launch kernel */ - - KernelLaunchConfig config; - config.blockSize[0] = c_bufOpsThreadsPerBlock; - config.blockSize[1] = 1; - config.blockSize[2] = 1; - config.gridSize[0] = ((numAtoms + 1) + c_bufOpsThreadsPerBlock - 1) / c_bufOpsThreadsPerBlock; - config.gridSize[1] = 1; - config.gridSize[2] = 1; - config.sharedMemorySize = 0; - - auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel - : nbnxn_gpu_add_nbat_f_to_f_kernel; - - if (useGpuFPmeReduction) - { - GMX_ASSERT(pmeForcesDevice, "Need a valid pmeForcesDevice pointer"); - kernelFn = accumulateForce ? 
nbnxn_gpu_add_nbat_f_to_f_kernel - : nbnxn_gpu_add_nbat_f_to_f_kernel; - } - - const float3* d_fNB = adat->f; - const float3* d_fPme = static_cast(pmeForcesDevice); - float3* d_fTotal = asFloat3(totalForcesDevice); - const int* d_cell = nb->cell; - - const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal, - &d_cell, &atomStart, &numAtoms); - - launchGpuKernel(kernelFn, config, deviceStream, nullptr, "FbufferOps", kernelArgs); - - if (atomLocality == AtomLocality::Local) - { - GMX_ASSERT(nb->localFReductionDone != nullptr, - "localFReductionDone has to be a valid pointer"); - nb->localFReductionDone->markEvent(deviceStream); - } + return nb->atdat->f; } } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index f754cc4795..b1d6774a26 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -321,8 +321,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, nb->ncxy_na_alloc = 0; nb->ncxy_ind = 0; nb->ncxy_ind_alloc = 0; - nb->ncell = 0; - nb->ncell_alloc = 0; if (debug) { @@ -659,27 +657,4 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv return; } -/* Initialization for F buffer operations on GPU. */ -void nbnxn_gpu_init_add_nbat_f_to_f(const int* cell, - NbnxmGpu* gpu_nbv, - int natoms_total, - GpuEventSynchronizer* const localReductionDone) -{ - - const DeviceStream& deviceStream = *gpu_nbv->deviceStreams[InteractionLocality::Local]; - - GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer"); - gpu_nbv->localFReductionDone = localReductionDone; - - if (natoms_total > 0) - { - reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, - *gpu_nbv->deviceContext_); - copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream, - GpuApiCallBehavior::Async, nullptr); - } - - return; -} - } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index 1044c10162..7c92a1abdc 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -158,12 +158,6 @@ struct NbnxmGpu bool bUseTwoStreams = false; /*! \brief atom data */ cu_atomdata_t* atdat = nullptr; - /*! \brief f buf ops cell index mapping */ - int* cell = nullptr; - /*! \brief number of indices in cell buffer */ - int ncell = 0; - /*! \brief number of indices allocated in cell buffer */ - int ncell_alloc = 0; /*! \brief array of atom indices */ int* atomIndices = nullptr; /*! \brief size of atom indices */ @@ -214,14 +208,6 @@ struct NbnxmGpu * will be true. */ gmx::EnumerationArray haveWork = { { false } }; - /*! \brief Pointer to event synchronizer triggered when the local - * GPU buffer ops / reduction is complete - * - * \note That the synchronizer is managed outside of this module - * in StatePropagatorDataGpu. - */ - GpuEventSynchronizer* localFReductionDone = nullptr; - /*! \brief Event triggered when non-local coordinate buffer * has been copied from device to host. 
*/ GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr; diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 67890a9606..76e2eead9f 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -179,48 +179,27 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const gmx::AtomLocality local wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); } -void nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const gmx::AtomLocality locality, - DeviceBuffer totalForcesDevice, - void* forcesPmeDevice, - gmx::ArrayRef dependencyList, - bool useGpuFPmeReduction, - bool accumulateForce) +int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) { - - GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)), - "GPU PME force reduction is only valid when a non-null GPU PME force pointer is " - "available"); - - /* Skip the reduction if there was no short-range GPU work to do - * (either NB or both NB and bonded work). */ - if (!pairlistIsSimple() && !Nbnxm::haveGpuShortRangeWork(gpu_nbv, locality)) + int numAtoms = 0; + switch (locality) { - return; + case gmx::AtomLocality::All: numAtoms = pairSearch_->gridSet().numRealAtomsTotal(); break; + case gmx::AtomLocality::Local: numAtoms = pairSearch_->gridSet().numRealAtomsLocal(); break; + case gmx::AtomLocality::NonLocal: + numAtoms = pairSearch_->gridSet().numRealAtomsTotal() + - pairSearch_->gridSet().numRealAtomsLocal(); + break; + case gmx::AtomLocality::Count: + GMX_ASSERT(false, "Count is invalid locality specifier"); + break; } - - wallcycle_start(wcycle_, ewcLAUNCH_GPU); - wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); - - reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, - dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce); - - wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS); - wallcycle_stop(wcycle_, ewcLAUNCH_GPU); + return numAtoms; } -void nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone) +void* nonbonded_verlet_t::getGpuForces() { - - wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); - - const Nbnxm::GridSet& gridSet = pairSearch_->gridSet(); - - Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(), gpu_nbv, - gridSet.numRealAtomsTotal(), localReductionDone); - - wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); - wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); + return Nbnxm::getGpuForces(gpu_nbv); } real nonbonded_verlet_t::pairlistInnerRadius() const diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index bcb3b34c33..2596350e52 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -358,14 +358,18 @@ public: bool useGpuFPmeReduction, bool accumulateForce); - /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. + /*! \brief Get the number of atoms for a given locality * - * \param localReductionDone Pointer to an event synchronizer that marks the completion of the local f buffer ops kernel. + * \param [in] locality Local or non-local + * \returns The number of atoms for given locality */ - void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone); + int getNumAtoms(gmx::AtomLocality locality); - /*! \brief return GPU pointer to f in rvec format */ - void* get_gpu_frvec(); + /*! \brief Get the pointer to the GPU nonbonded force buffer + * + * \returns A pointer to the force buffer in GPU memory + */ + void* getGpuForces(); //! 
Return the kernel setup const Nbnxm::KernelSetup& kernelSetup() const { return kernelSetup_; } diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index eace699386..00e7ae11f5 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -340,45 +340,19 @@ GPU_FUNC_QUALIFIER bool haveGpuShortRangeWork(const NbnxmGpu gmx_unused* nb, gmx::AtomLocality gmx_unused aLocality) GPU_FUNC_TERM_WITH_RETURN(false); -/*! \brief Initialization for F buffer operations on GPU */ -CUDA_FUNC_QUALIFIER -void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused* cell, - NbnxmGpu gmx_unused* gpu_nbv, - int gmx_unused natoms_total, - GpuEventSynchronizer gmx_unused* localReductionDone) CUDA_FUNC_TERM; - -/*! \brief Force buffer operations on GPU. - * - * Transforms non-bonded forces into plain rvec format and add all the force components to the total - * force buffer - * - * \param[in] atomLocality If the reduction should be performed on local or non-local atoms. - * \param[in] totalForcesDevice Device buffer to accumulate resulting force. - * \param[in] gpu_nbv The NBNXM GPU data structure. - * \param[in] pmeForcesDevice Device buffer with PME forces. - * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on. - * \param[in] atomStart Index of the first atom to reduce forces for. - * \param[in] numAtoms Number of atoms to reduce forces for. - * \param[in] useGpuFPmeReduction Whether PME forces should be added. - * \param[in] accumulateForce Whether there are usefull data already in the total force buffer. - * - */ -CUDA_FUNC_QUALIFIER -void nbnxn_gpu_add_nbat_f_to_f(gmx::AtomLocality gmx_unused atomLocality, - DeviceBuffer gmx_unused totalForcesDevice, - NbnxmGpu gmx_unused* gpu_nbv, - void gmx_unused* pmeForcesDevice, - gmx::ArrayRef gmx_unused dependencyList, - int gmx_unused atomStart, - int gmx_unused numAtoms, - bool gmx_unused useGpuFPmeReduction, - bool gmx_unused accumulateForce) CUDA_FUNC_TERM; - /*! \brief sync CPU thread on coordinate copy to device * \param[in] nb The nonbonded data GPU structure */ CUDA_FUNC_QUALIFIER void nbnxn_wait_x_on_device(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM; +/*! \brief Get the pointer to the GPU nonbonded force buffer + * + * \param[in] nb The nonbonded data GPU structure + * \returns A pointer to the force buffer in GPU memory + */ +CUDA_FUNC_QUALIFIER +void* getGpuForces(NbnxmGpu gmx_unused* nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr); + } // namespace Nbnxm #endif -- 2.22.0
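Usage sketch (reviewer note, not part of the patch): the snippet below illustrates how a caller
is expected to drive the new GpuForceReduction class, following the pattern of
setupGpuForceReductions() in sim_util.cpp and the construction in runner.cpp above. Variable
names such as deviceContext, localStream, stateGpu, nbv, pmedata, haveCpuForceWork and
reductionDoneEvent are local stand-ins for this illustration only.

    // One reduction object per atom locality, bound to a device context and stream
    // (runner.cpp constructs these only when useGpuBufferOps is active).
    auto gpuForceReduction =
            std::make_unique<gmx::GpuForceReduction>(deviceContext, localStream);

    // Register the nbat-format nonbonded force buffer; the reduction kernel performs
    // the nbat->rvec conversion internally using the cell mapping passed to reinit().
    gpuForceReduction->registerNbnxmForce(nbv->getGpuForces());

    // Optionally register one rvec-format buffer (e.g. PME forces) plus the event
    // signalling that this buffer is ready to be read.
    gpuForceReduction->registerRvecForce(pme_gpu_get_device_f(pmedata));
    gpuForceReduction->addDependency(pme_gpu_get_f_ready_synchronizer(pmedata));

    // (Re-)initialize on search steps, when the atom set and cell mapping change.
    const int  atomStart  = 0;
    const bool accumulate = haveCpuForceWork; // add into, rather than overwrite, the base buffer
    gpuForceReduction->reinit(stateGpu->getForces(), nbv->getNumAtoms(gmx::AtomLocality::Local),
                              nbv->getGridIndices(), atomStart, accumulate, reductionDoneEvent);

    // Every step: enqueue waits on the registered dependencies, then launch the
    // reduction kernel on the stream given at construction.
    gpuForceReduction->execute();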