This patch adds GPU buffer ops for force buffers.
Enable with the GMX_USE_GPU_BUFFER_OPS environment variable.
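Setting the variable to any value (only its presence is checked)
activates the new path; without it the existing CPU buffer ops are used.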
Currently, the H2D transfer of the force buffer is switched on with
haveSpecialForces || haveCpuBondedWork || haveCpuPmeWork,
where haveCpuPmeWork is true even when useGpuPme == true
until on-GPU PME-nonbonded reduction is added in a follow-up change.
TODO: enable PME reduction in GPU buffer ops and remove associated H2D
transfer
Implements part of #2817
Change-Id: Ice984425301d24bac1340e883698244489cd686e
// PME-first ordering would suffice).
static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);
-// environment variable to enable GPU buffer ops, to alow incremental and optional
+// environment variable to enable GPU buffer ops, to allow incremental and optional
// introduction of this functionality.
// TODO eventually tie this in with other existing GPU flags.
static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
-
static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
{
const int end = forceToAdd.size();
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
- as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
+ as_rvec_array(force->unpaddedArrayRef().data()),
+ BufferOpsUseGpu::False,
+ GpuBufferOpsAccumulateForce::Null,
+ wcycle);
}
}
}
((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
+ const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
+ && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ?
+ BufferOpsUseGpu::True : BufferOpsUseGpu::False;
- const bool useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA));
+ const BufferOpsUseGpu useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) ?
+        BufferOpsUseGpu::True : BufferOpsUseGpu::False;
/* At a search step we need to start the first balancing region
* somewhere early inside the step after communication during domain
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
wallcycle_stop(wcycle, ewcNS);
- if (useGpuXBufOps)
+ if (useGpuXBufOps == BufferOpsUseGpu::True)
{
nbv->atomdata_init_copy_x_to_nbat_x_gpu();
}
-
+    // For force buffer ops, we use the condition below rather than
+    // useGpuFBufOps to ensure that init is performed even if this
+    // NS step is also a virial step (on which f buf ops are deactivated).
+ if (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
+ {
+ nbv->atomdata_init_add_nbat_f_to_f_gpu(wcycle);
+ }
}
else
{
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
- if (bNS || !useGpuXBufOps)
+ if (bNS || (useGpuXBufOps == BufferOpsUseGpu::False))
{
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
Nbnxm::AtomLocality::Local);
{
wallcycle_start(wcycle, ewcLAUNCH_GPU);
- if (bNS || !useGpuXBufOps)
+ if (bNS || (useGpuXBufOps == BufferOpsUseGpu::False))
{
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
/* launch D2H copy-back F */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+
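+        // When force buffer ops run on the GPU, skip the nonbonded force D2H
+        // copy here; the reduced forces are copied back after the buffer-op
+        // kernel instead.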
+ bool copyBackNbForce = (useGpuFBufOps == BufferOpsUseGpu::False);
+
if (havePPDomainDecomposition(cr))
{
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- flags, Nbnxm::AtomLocality::NonLocal);
+ flags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
}
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- flags, Nbnxm::AtomLocality::Local);
+ flags, Nbnxm::AtomLocality::Local, copyBackNbForce);
+
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
* communication with calculation with domain decomposition.
*/
wallcycle_stop(wcycle, ewcFORCE);
-
- nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f, wcycle);
+ nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f,
+ BufferOpsUseGpu::False,
+ GpuBufferOpsAccumulateForce::Null,
+ wcycle);
wallcycle_start_nocount(wcycle, ewcFORCE);
flags, &forceOut.forceWithVirial, enerd,
ed, bNS);
+    // flag to specify whether CPU force output is present in the force
+    // buffer. For now, this is true even when useGpuPme == true
+    // (because on-GPU PME-nonbonded reduction will be added in a
+    // follow-up change).
+    // TODO adapt the below when on-GPU PME-nonbonded reduction is available.
+ bool useCpuPmeReduction = true;
+ bool haveCpuForces = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuPmeReduction);
+    // flag to specify whether forces should be accumulated in force buffer
+    // ops. For now, this is determined solely by the haveCpuForces flag
+    // above, but in future developments it will also depend on
+    // e.g. whether the GPU force halo exchange is active.
+ GpuBufferOpsAccumulateForce accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) &&
+ haveCpuForces ? GpuBufferOpsAccumulateForce::True :
+ GpuBufferOpsAccumulateForce::False;
+
// Will store the amount of cycles spent waiting for the GPU that
// will be later used in the DLB accounting.
float cycles_wait_gpu = 0;
wallcycle_stop(wcycle, ewcFORCE);
}
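+        // CPU-side forces are already in forceOut.f, so stage them on the
+        // GPU first and let the buffer-op kernel accumulate into them.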
+ if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
+ {
+ nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
+ }
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
- forceOut.f, wcycle);
+ forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+ if (useGpuFBufOps == BufferOpsUseGpu::True)
+ {
+ nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
+ }
if (fr->nbv->emulateGpu() && (flags & GMX_FORCE_VIRIAL))
{
if (bDoForces)
{
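+            // dd_move_f() below reads the non-local forces on the host, so
+            // block until the D2H copy of the reduced forces has completed.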
+ if (useGpuFBufOps == BufferOpsUseGpu::True)
+ {
+ nbv->wait_stream_gpu(Nbnxm::AtomLocality::NonLocal);
+ }
dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
}
}
// With both nonbonded and PME offloaded a GPU on the same rank, we use
// an alternating wait/reduction scheme.
- bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
+ bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr) &&
+ (useGpuFBufOps == BufferOpsUseGpu::False));
if (alternateGpuWait)
{
alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &force, &forceOut.forceWithVirial, fr->fshift, enerd,
pme_gpu_reinit_computation(fr->pmedata, wcycle);
}
+ /* Do the nonbonded GPU (or emulation) force buffer reduction
+ * on the non-alternating path. */
+ if (bUseOrEmulGPU && !alternateGpuWait)
+ {
+
+ if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
+ {
+ nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
+ }
+ nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
+ forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
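+        // Copy the reduced local forces back to the host and synchronize,
+        // since they are consumed on the CPU later in the step.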
+ if (useGpuFBufOps == BufferOpsUseGpu::True)
+ {
+ nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
+ nbv->wait_stream_gpu(Nbnxm::AtomLocality::Local);
+ }
+
+ }
+
if (bUseGPU)
{
/* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
- /* Do the nonbonded GPU (or emulation) force buffer reduction
- * on the non-alternating path. */
- if (bUseOrEmulGPU && !alternateGpuWait)
- {
- nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
- forceOut.f, wcycle);
- }
if (DOMAINDECOMP(cr))
{
dd_force_flop_stop(cr->dd, nrnb);
{
int gridBegin = 0;
int gridEnd = 0;
+
switch (locality)
{
case Nbnxm::AtomLocality::All:
}
}
+
/* Add the force array(s) from nbnxn_atomdata_t to f */
-void reduceForces(nbnxn_atomdata_t *nbat,
- const Nbnxm::AtomLocality locality,
- const Nbnxm::GridSet &gridSet,
- rvec *f)
+template <bool useGpu>
+void reduceForces(nbnxn_atomdata_t *nbat,
+ const Nbnxm::AtomLocality locality,
+ const Nbnxm::GridSet &gridSet,
+ rvec *f,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ GpuBufferOpsAccumulateForce accumulateForce)
{
int a0 = 0;
int na = 0;
- switch (locality)
- {
- case Nbnxm::AtomLocality::All:
- a0 = 0;
- na = gridSet.numRealAtomsTotal();
- break;
- case Nbnxm::AtomLocality::Local:
- a0 = 0;
- na = gridSet.numRealAtomsLocal();
- break;
- case Nbnxm::AtomLocality::NonLocal:
- a0 = gridSet.numRealAtomsLocal();
- na = gridSet.numRealAtomsTotal() - gridSet.numRealAtomsLocal();
- break;
- case Nbnxm::AtomLocality::Count:
- GMX_ASSERT(false, "Count is invalid locality specifier");
- break;
- }
+
+ nbnxn_get_atom_range(locality, gridSet, &a0, &na);
if (na == 0)
{
return;
}
- int nth = gmx_omp_nthreads_get(emntNonbonded);
+ if (useGpu)
+ {
+ Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
+ gpu_nbv,
+ a0, na,
+ accumulateForce);
- if (nbat->out.size() > 1)
+ }
+ else
{
- if (locality != Nbnxm::AtomLocality::All)
- {
- gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
- }
+ int nth = gmx_omp_nthreads_get(emntNonbonded);
- /* Reduce the force thread output buffers into buffer 0, before adding
- * them to the, differently ordered, "real" force buffer.
- */
- if (nbat->bUseTreeReduce)
+ if (nbat->out.size() > 1)
{
- nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
- }
- else
- {
- nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
+ if (locality != Nbnxm::AtomLocality::All)
+ {
+ gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
+ }
+
+ /* Reduce the force thread output buffers into buffer 0, before adding
+ * them to the, differently ordered, "real" force buffer.
+ */
+ if (nbat->bUseTreeReduce)
+ {
+ nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
+ }
+ else
+ {
+ nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
+ }
}
- }
#pragma omp parallel for num_threads(nth) schedule(static)
- for (int th = 0; th < nth; th++)
- {
- try
+ for (int th = 0; th < nth; th++)
{
- nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
- nbat->out[0],
- a0 + ((th + 0)*na)/nth,
- a0 + ((th + 1)*na)/nth,
- f);
+ try
+ {
+ nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
+ nbat->out[0],
+ a0 + ((th + 0)*na)/nth,
+ a0 + ((th + 1)*na)/nth,
+ f);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
- GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
}
+template
+void reduceForces<true>(nbnxn_atomdata_t *nbat,
+ const Nbnxm::AtomLocality locality,
+ const Nbnxm::GridSet &gridSet,
+ rvec *f,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ GpuBufferOpsAccumulateForce accumulateForce);
+
+template
+void reduceForces<false>(nbnxn_atomdata_t *nbat,
+ const Nbnxm::AtomLocality locality,
+ const Nbnxm::GridSet &gridSet,
+ rvec *f,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ GpuBufferOpsAccumulateForce accumulateForce);
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
rvec *fshift)
rvec_inc(fshift[s], sum);
}
}
+
+void nbnxn_get_atom_range(const Nbnxm::AtomLocality atomLocality,
+ const Nbnxm::GridSet &gridSet,
+ int *atomStart,
+ int *nAtoms)
+{
+
+ switch (atomLocality)
+ {
+ case Nbnxm::AtomLocality::All:
+ *atomStart = 0;
+ *nAtoms = gridSet.numRealAtomsTotal();
+ break;
+ case Nbnxm::AtomLocality::Local:
+ *atomStart = 0;
+ *nAtoms = gridSet.numRealAtomsLocal();
+ break;
+ case Nbnxm::AtomLocality::NonLocal:
+ *atomStart = gridSet.numRealAtomsLocal();
+ *nAtoms = gridSet.numRealAtomsTotal() - gridSet.numRealAtomsLocal();
+ break;
+ case Nbnxm::AtomLocality::Count:
+ GMX_ASSERT(false, "Count is invalid locality specifier");
+ break;
+ }
+}
struct t_mdatoms;
struct tMPI_Atomic;
+enum class BufferOpsUseGpu;
+
namespace Nbnxm
{
class GridSet;
enum class KernelType;
}
+enum class GpuBufferOpsAccumulateForce;
+
/* Convenience type for vector with aligned memory */
template<typename T>
using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>;
void *);
//! Add the computed forces to \p f, an internal reduction might be performed as well
-void reduceForces(nbnxn_atomdata_t *nbat,
- Nbnxm::AtomLocality locality,
- const Nbnxm::GridSet &gridSet,
- rvec *f);
+template <bool useGpu>
+void reduceForces(nbnxn_atomdata_t *nbat,
+ Nbnxm::AtomLocality locality,
+ const Nbnxm::GridSet &gridSet,
+ rvec *f,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ GpuBufferOpsAccumulateForce accumulateForce);
+
+
+extern template
+void reduceForces<true>(nbnxn_atomdata_t *nbat,
+ const Nbnxm::AtomLocality locality,
+ const Nbnxm::GridSet &gridSet,
+ rvec *f,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ GpuBufferOpsAccumulateForce accumulateForce);
+
+extern template
+void reduceForces<false>(nbnxn_atomdata_t *nbat,
+ const Nbnxm::AtomLocality locality,
+ const Nbnxm::GridSet &gridSet,
+ rvec *f,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ GpuBufferOpsAccumulateForce accumulateForce);
/* Add the fshift force stored in nbat to fshift */
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
rvec *fshift);
+/* Get the atom start index and number of atoms for a given locality */
+void nbnxn_get_atom_range(Nbnxm::AtomLocality atomLocality,
+ const Nbnxm::GridSet &gridSet,
+ int *atomStart,
+ int *nAtoms);
#endif
/*! \internal \file
*
* \brief
- * CUDA kernel for GPU version of copy_rvec_to_nbat_real.
- * Converts coordinate data from rvec to nb format.
+ * CUDA kernels for GPU versions of copy_rvec_to_nbat_real and add_nbat_f_to_f.
*
* \author Alan Gray <alang@nvidia.com>
* \author Jon Vincent <jvincent@nvidia.com>
}
}
+
+/*! \brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f
+ *
+ * \tparam        accumulateForce  If true, accumulate into \p f; otherwise overwrite \p f
+ * \param[in]     fnb              Force in nbat format
+ * \param[in,out] f                Force buffer to be reduced into
+ * \param[in]     cell             Cell index mapping
+ * \param[in]     atomStart        Start atom index
+ * \param[in]     nAtoms           Number of atoms
+ */
+template <bool accumulateForce>
+__global__ void
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb,
+ rvec * f,
+ const int *__restrict__ cell,
+ const int atomStart,
+ const int nAtoms);
+template <bool accumulateForce>
+__global__ void
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb,
+ rvec * f,
+ const int *__restrict__ cell,
+ const int atomStart,
+ const int nAtoms)
+{
+
+ /* map particle-level parallelism to 1D CUDA thread and block index */
+ int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;
+
+    /* perform addition for each particle */
+ if (threadIndex < nAtoms)
+ {
+
+ int i = cell[atomStart+threadIndex];
+ float3 *f_dest = (float3 *)&f[atomStart+threadIndex][XX];
+
+ if (accumulateForce)
+ {
+ *f_dest += fnb[i];
+ }
+ else
+ {
+ *f_dest = fnb[i];
+ }
+
+ }
+ return;
+}
#include "nbnxm_cuda.h"
#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_common.h"
namespace Nbnxm
{
+//! Number of CUDA threads in a block
+// TODO Optimize this through experimentation
+constexpr static int c_bufOpsThreadsPerBlock = 128;
+
/*! Nonbonded kernel function pointer type */
typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
const cu_nbparam_t,
void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
nbnxn_atomdata_t *nbatom,
const int flags,
- const AtomLocality atomLocality)
+ const AtomLocality atomLocality,
+ const bool copyBackNbForce)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
}
/* DtoH f */
- cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
- (adat_len)*sizeof(*adat->f), stream);
+ if (copyBackNbForce)
+ {
+ cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
+ (adat_len)*sizeof(*adat->f), stream);
+ }
/* After the non-local D2H is launched the nonlocal_done event can be
recorded which signals that the local D2H can proceed. This event is not
}
/* launch kernel on GPU */
- const int threadsPerBlock = 128;
KernelLaunchConfig config;
- config.blockSize[0] = threadsPerBlock;
+ config.blockSize[0] = c_bufOpsThreadsPerBlock;
config.blockSize[1] = 1;
config.blockSize[2] = 1;
- config.gridSize[0] = (grid.numCellsColumnMax()*numAtomsPerCell + threadsPerBlock - 1)/threadsPerBlock;
+ config.gridSize[0] = (grid.numCellsColumnMax()*numAtomsPerCell + c_bufOpsThreadsPerBlock - 1)/c_bufOpsThreadsPerBlock;
config.gridSize[1] = numColumns;
config.gridSize[2] = 1;
GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this");
nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
}
+/* F buffer operations on GPU: perform force summation and conversion from nbat to rvec format. */
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
+ gmx_nbnxn_gpu_t *nb,
+ int atomStart,
+ int nAtoms,
+ GpuBufferOpsAccumulateForce accumulateForce)
+{
+
+ cu_atomdata_t *adat = nb->atdat;
+ cudaStream_t stream = atomLocality == AtomLocality::Local ?
+ nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+
+ /* launch kernel */
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_bufOpsThreadsPerBlock;
+ config.blockSize[1] = 1;
+ config.blockSize[2] = 1;
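+    // One thread per atom; round the 1D grid up so every atom is covered.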
+ config.gridSize[0] = ((nAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
+ config.gridSize[1] = 1;
+ config.gridSize[2] = 1;
+ config.sharedMemorySize = 0;
+ config.stream = stream;
+
+ auto kernelFn = (accumulateForce == GpuBufferOpsAccumulateForce::True) ?
+ nbnxn_gpu_add_nbat_f_to_f_kernel<true> : nbnxn_gpu_add_nbat_f_to_f_kernel<false>;
+ const float3 *fPtr = adat->f;
+ rvec *frvec = nb->frvec;
+ const int *cell = nb->cell;
+
+ const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
+ &fPtr,
+ &frvec,
+ &cell,
+ &atomStart,
+ &nAtoms);
+
+ launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
+
+}
+
+void nbnxn_launch_copy_f_to_gpu(const AtomLocality atomLocality,
+ const Nbnxm::GridSet &gridSet,
+ gmx_nbnxn_gpu_t *nb,
+ rvec *f)
+{
+ cudaStream_t stream = atomLocality == AtomLocality::Local ?
+ nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+ bool bDoTime = nb->bDoTime;
+ cu_timers_t *t = nb->timers;
+
+ int atomStart = 0, nAtoms = 0;
+
+ nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+ if (bDoTime)
+ {
+ t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+ }
+
+ rvec *ptrDest = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
+ rvec *ptrSrc = reinterpret_cast<rvec *> (f[atomStart]);
+ //copyToDeviceBuffer(&ptrDest, ptrSrc, 0, nAtoms,
+ // stream, GpuApiCallBehavior::Async, nullptr);
+    // TODO use the above API call rather than a direct memcpy once the force buffer is implemented as a HostVector
+ cudaMemcpyAsync(ptrDest, ptrSrc, nAtoms*sizeof(rvec), cudaMemcpyHostToDevice,
+ stream);
+
+ if (bDoTime)
+ {
+ t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+ }
+
+ return;
+}
+
+void nbnxn_launch_copy_f_from_gpu(const AtomLocality atomLocality,
+ const Nbnxm::GridSet &gridSet,
+ gmx_nbnxn_gpu_t *nb,
+ rvec *f)
+{
+ cudaStream_t stream = atomLocality == AtomLocality::Local ?
+ nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+ bool bDoTime = nb->bDoTime;
+ cu_timers_t *t = nb->timers;
+
+ int atomStart = 0, nAtoms = 0;
+
+ nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+ if (bDoTime)
+ {
+ t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
+ }
+
+ rvec *ptrDest = reinterpret_cast<rvec *> (f[atomStart]);
+ rvec *ptrSrc = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
+ //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
+ // stream, GpuApiCallBehavior::Async, nullptr);
+    // TODO use the above API call rather than a direct memcpy once the force buffer is implemented as a HostVector
+ cudaMemcpyAsync(ptrDest, ptrSrc, nAtoms*sizeof(rvec), cudaMemcpyDeviceToHost,
+ stream);
+
+ if (bDoTime)
+ {
+ t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
+ }
+
+ return;
+}
+
+void nbnxn_wait_stream_gpu(const AtomLocality atomLocality,
+ gmx_nbnxn_gpu_t *nb)
+{
+
+ cudaStream_t stream = atomLocality == AtomLocality::Local ?
+ nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+
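+    // Block the host until all work enqueued in this locality's stream
+    // (buffer-op kernel and D2H copy) has completed.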
+ cudaStreamSynchronize(stream);
+
+}
+
} // namespace Nbnxm
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/gridset.h"
#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
#include "gromacs/nbnxm/pairlistsets.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
nb->ncxy_na_alloc = 0;
nb->ncxy_ind = 0;
nb->ncxy_ind_alloc = 0;
+ nb->nfrvec = 0;
+ nb->nfrvec_alloc = 0;
+ nb->ncell = 0;
+ nb->ncell_alloc = 0;
if (debug)
{
return;
}
+/* Initialization for F buffer operations on GPU. */
+void nbnxn_gpu_init_add_nbat_f_to_f(const int *cell,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ int natoms_total)
+{
+
+ cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
+
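+    // (Re)allocate the device rvec force buffer and upload the cell index
+    // mapping used by the reduction kernel.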
+ reallocateDeviceBuffer(&gpu_nbv->frvec, natoms_total, &gpu_nbv->nfrvec, &gpu_nbv->nfrvec_alloc, nullptr);
+
+ if (natoms_total > 0)
+ {
+ reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
+ copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
+ }
+
+ return;
+}
+
} // namespace Nbnxm
int natoms;
//! number of atoms allocated in device buffer
int natoms_alloc;
- //! x buf ops input buffer index mapping
+ //! force in rvec format
+ rvec *frvec;
+ //! number of atoms in force buffer
+ int nfrvec;
+ //! number of atoms allocated in force buffer
+ int nfrvec_alloc;
+ //! f buf ops cell index mapping
+ int *cell;
+ //! number of indices in cell buffer
+ int ncell;
+ //! number of indices allocated in cell buffer
+ int ncell_alloc;
+ //! array of atom indices
int *atomIndices;
//! size of atom indices
int atomIndicesSize;
void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality locality,
const bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
- bool useGpu,
+ BufferOpsUseGpu useGpu,
void *xPmeDevicePtr,
gmx_wallcycle *wcycle)
{
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
- auto fnPtr = useGpu ?
+ auto fnPtr = (useGpu == BufferOpsUseGpu::True) ?
nbnxn_atomdata_copy_x_to_nbat_x<true> :
nbnxn_atomdata_copy_x_to_nbat_x<false>;
}
void
-nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality,
- rvec *f,
- gmx_wallcycle *wcycle)
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality,
+ rvec *f,
+ BufferOpsUseGpu useGpu,
+ GpuBufferOpsAccumulateForce accumulateForce,
+ gmx_wallcycle *wcycle)
{
+
+ GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) &&
+ (accumulateForce == GpuBufferOpsAccumulateForce::True)),
+               "Accumulation of force is only valid when GPU buffer ops are active");
+
/* Skip the reduction if there was no short-range GPU work to do
* (either NB or both NB and bonded work). */
if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
- reduceForces(nbat.get(), locality, pairSearch_->gridSet(), f);
+ auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
+ fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce);
+
+ wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+}
+
+void
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle)
+{
+
+ wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+
+ const Nbnxm::GridSet &gridSet = pairSearch_->gridSet();
+
+ Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(),
+ gpu_nbv,
+ gridSet.numRealAtomsTotal());
wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
{
Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
}
+
+void nonbonded_verlet_t::launch_copy_f_to_gpu(rvec *f, const Nbnxm::AtomLocality locality)
+{
+ nbnxn_launch_copy_f_to_gpu(locality,
+ pairSearch_->gridSet(),
+ gpu_nbv,
+ f);
+}
+
+void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocality locality)
+{
+ nbnxn_launch_copy_f_from_gpu(locality,
+ pairSearch_->gridSet(),
+ gpu_nbv,
+ f);
+}
+
+void nonbonded_verlet_t::wait_stream_gpu(const Nbnxm::AtomLocality locality)
+{
+ nbnxn_wait_stream_gpu(locality, gpu_nbv);
+}
+
/*! \endcond */
struct t_forcerec;
struct t_inputrec;
+/*! \brief Switch for whether to use GPU for buffer ops */
+enum class BufferOpsUseGpu
+{
+ True,
+ False
+};
+
+/*! \brief Switch for whether forces should accumulate in GPU buffer ops */
+enum class GpuBufferOpsAccumulateForce
+{
+ True, // Force should be accumulated and format converted
+    False, // Force should not be accumulated, just format converted
+ Null // GPU buffer ops are not in use, so this object is not applicable
+};
+
+
namespace gmx
{
class MDLogger;
void setCoordinates(Nbnxm::AtomLocality locality,
bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
- bool useGpu,
+ BufferOpsUseGpu useGpu,
void *xPmeDevicePtr,
gmx_wallcycle *wcycle);
gmx_wallcycle *wcycle);
//! Add the forces stored in nbat to f, zeros the forces in nbat */
- void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality,
- rvec *f,
- gmx_wallcycle *wcycle);
+ void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality,
+ rvec *f,
+ BufferOpsUseGpu useGpu,
+ GpuBufferOpsAccumulateForce accumulateForce,
+ gmx_wallcycle *wcycle);
+
+ /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
+ void atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle);
+
+    /*! \brief Launch asynchronous H2D copy of the force buffer */
+    void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
+    /*! \brief Launch asynchronous D2H copy of the force buffer */
+    void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
+ /*! \brief Host sync on device stream given by locality */
+ void wait_stream_gpu(Nbnxm::AtomLocality locality);
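+
+    /* Sketch of the intended call sequence (as used in do_force by this
+     * patch): optionally launch_copy_f_to_gpu() to stage CPU forces,
+     * atomdata_add_nbat_f_to_f() with BufferOpsUseGpu::True to reduce on the
+     * GPU, then launch_copy_f_from_gpu() and wait_stream_gpu() before the
+     * forces are consumed on the host. */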
//! Return the kernel setup
const Nbnxm::KernelSetup &kernelSetup() const
struct nbnxn_atomdata_t;
enum class GpuTaskCompletion;
+enum class GpuBufferOpsAccumulateForce;
namespace gmx
{
void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
nbnxn_atomdata_t gmx_unused *nbatom,
int gmx_unused flags,
- AtomLocality gmx_unused aloc) GPU_FUNC_TERM
+ AtomLocality gmx_unused aloc,
+ const bool gmx_unused copyBackNbForce) GPU_FUNC_TERM
/*! \brief Attempts to complete nonbonded GPU task.
*
bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t gmx_unused *nb,
const Nbnxm::AtomLocality gmx_unused aLocality) GPU_FUNC_TERM_WITH_RETURN(false)
+/*! \brief Initialization for F buffer operations on GPU */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell,
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ int gmx_unused natoms_total) CUDA_FUNC_TERM
+
+/*! \brief F buffer operations on GPU: add forces in nbat format to the force buffer in rvec format. */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality gmx_unused atomLocality,
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ int gmx_unused atomStart,
+ int gmx_unused nAtoms,
+ GpuBufferOpsAccumulateForce gmx_unused accumulateForce) CUDA_FUNC_TERM
+
+/*! \brief Copy force buffer from CPU to GPU */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_f_to_gpu(const AtomLocality gmx_unused atomLocality,
+ const Nbnxm::GridSet gmx_unused &gridSet,
+ gmx_nbnxn_gpu_t gmx_unused *nb,
+ rvec gmx_unused *f) CUDA_FUNC_TERM
+
+/*! \brief Copy force buffer from GPU to CPU */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_f_from_gpu(const AtomLocality gmx_unused atomLocality,
+ const Nbnxm::GridSet gmx_unused &gridSet,
+ gmx_nbnxn_gpu_t gmx_unused *nb,
+ rvec gmx_unused *f) CUDA_FUNC_TERM
+
+/*! \brief Wait for GPU stream to complete */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_stream_gpu(const AtomLocality gmx_unused atomLocality,
+ gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM
+
-} // namespace Nbnxm
+} // namespace Nbnxm
#endif
* Launch asynchronously the download of nonbonded forces from the GPU
* (and energies/shift forces if required).
*/
-void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
- struct nbnxn_atomdata_t *nbatom,
- const int flags,
- const AtomLocality aloc)
+void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
+ struct nbnxn_atomdata_t *nbatom,
+ const int flags,
+ const AtomLocality aloc,
+ const bool gmx_unused copyBackNbForce)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");