From 8e83edea6ef6a77c059f913c77eb3834f87f6abc Mon Sep 17 00:00:00 2001
From: Alan Gray
Date: Fri, 8 Mar 2019 01:05:47 -0800
Subject: [PATCH] F buffer operations in CUDA

This patch performs GPU buffer ops for force buffers, enabled with the
GMX_USE_GPU_BUFFER_OPS env variable.

Currently, the H2D transfer of the force buffer is switched on when
haveSpecialForces || haveCpuBondedWork || haveCpuPmeWork is true, where
haveCpuPmeWork remains true even when useGpuPme == true, until on-GPU
PME-nonbonded reduction is added in a follow-up.

TODO: enable PME reduction in GPU buffer ops and remove the associated
H2D transfer.

Implements part of #2817

Change-Id: Ice984425301d24bac1340e883698244489cd686e
---
 src/gromacs/mdlib/sim_util.cpp                | 100 +++++++++---
 src/gromacs/nbnxm/atomdata.cpp                | 135 +++++++++++------
 src/gromacs/nbnxm/atomdata.h                  |  37 ++++-
 .../nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh   |  51 ++++++-
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu          | 142 +++++++++++++++++-
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        |  24 +++
 src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h     |  14 +-
 src/gromacs/nbnxm/nbnxm.cpp                   |  59 +++++++-
 src/gromacs/nbnxm/nbnxm.h                     |  38 ++++-
 src/gromacs/nbnxm/nbnxm_gpu.h                 |  39 ++++-
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp        |   9 +-
 11 files changed, 551 insertions(+), 97 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 8ab56d27ce..766816c4e7 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -120,12 +120,11 @@
 // PME-first ordering would suffice).
 static const bool c_disableAlternatingWait = (getenv("GMX_DISABLE_ALTERNATING_GPU_WAIT") != nullptr);

-// environment variable to enable GPU buffer ops, to alow incremental and optional
+// environment variable to enable GPU buffer ops, to allow incremental and optional
 // introduction of this functionality.
 // TODO eventially tie this in with other existing GPU flags.
 static const bool c_enableGpuBufOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);

-
 static void sum_forces(rvec f[], gmx::ArrayRef<const gmx::RVec> forceToAdd)
 {
     const int end = forceToAdd.size();
@@ -692,7 +691,10 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv
             wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);

             nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                          as_rvec_array(force->unpaddedArrayRef().data()), wcycle);
+                                          as_rvec_array(force->unpaddedArrayRef().data()),
+                                          BufferOpsUseGpu::False,
+                                          GpuBufferOpsAccumulateForce::Null,
+                                          wcycle);
         }
     }
 }
@@ -862,8 +864,12 @@ void do_force(FILE *fplog,
         ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
         ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
         ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
+    const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
+        && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ?
+        BufferOpsUseGpu::True : BufferOpsUseGpu::False;

-    const bool useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA));
+    const BufferOpsUseGpu useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) ?
+        BufferOpsUseGpu::True : BufferOpsUseGpu::False;

     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -1037,11 +1043,17 @@ void do_force(FILE *fplog,
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
             wallcycle_stop(wcycle, ewcNS);

-            if (useGpuXBufOps)
+            if (useGpuXBufOps == BufferOpsUseGpu::True)
             {
                 nbv->atomdata_init_copy_x_to_nbat_x_gpu();
             }
-
+            // For force buffer ops, we use the condition below rather than
+            // useGpuFBufOps to ensure that init is performed even if this
+            // NS step is also a virial step (on which f buf ops are deactivated).
+            if (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
+            {
+                nbv->atomdata_init_add_nbat_f_to_f_gpu(wcycle);
+            }
         }
         else
@@ -1057,7 +1069,7 @@ void do_force(FILE *fplog,
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());

-            if (bNS || !useGpuXBufOps)
+            if (bNS || (useGpuXBufOps == BufferOpsUseGpu::False))
             {
                 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
                                           Nbnxm::AtomLocality::Local);
@@ -1121,7 +1133,7 @@ void do_force(FILE *fplog,
         {
             wallcycle_start(wcycle, ewcLAUNCH_GPU);

-            if (bNS || !useGpuXBufOps)
+            if (bNS || (useGpuXBufOps == BufferOpsUseGpu::False))
             {
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
                 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
@@ -1151,13 +1163,17 @@ void do_force(FILE *fplog,
             /* launch D2H copy-back F */
             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+
+            bool copyBackNbForce = (useGpuFBufOps == BufferOpsUseGpu::False);
+
             if (havePPDomainDecomposition(cr))
             {
                 Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                          flags, Nbnxm::AtomLocality::NonLocal);
+                                          flags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
             }
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      flags, Nbnxm::AtomLocality::Local);
+                                      flags, Nbnxm::AtomLocality::Local, copyBackNbForce);
+
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);

             if (ppForceWorkload->haveGpuBondedWork && (flags & GMX_FORCE_ENERGY))
@@ -1272,8 +1288,10 @@ void do_force(FILE *fplog,
          * communication with calculation with domain decomposition.
          */
        wallcycle_stop(wcycle, ewcFORCE);
-
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f, wcycle);
+        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f,
+                                      BufferOpsUseGpu::False,
+                                      GpuBufferOpsAccumulateForce::Null,
+                                      wcycle);

         wallcycle_start_nocount(wcycle, ewcFORCE);

@@ -1309,6 +1327,21 @@ void do_force(FILE *fplog,
                      flags, &forceOut.forceWithVirial, enerd,
                      ed, bNS);

+    // flag to specify if CPU force output is present in force
+    // buffer. For now, this is true even when useGpuPme == true
+    // (because on-GPU PME-nonbonded reduction will be added in
+    // follow-up)
+    // TODO adapt the below when on-GPU PME-nonbonded reduction is available.
+    bool useCpuPmeReduction = true;
+    bool haveCpuForces      = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuPmeReduction);
+    // flag to specify if forces should be accumulated in force buffer
+    // ops. For now, this is solely determined by above haveCpuForces
+    // flag, but in future developments it will also depend on
+    // e.g. whether the GPU force halo exchange is active.
+    GpuBufferOpsAccumulateForce accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) &&
+        haveCpuForces ?
+        GpuBufferOpsAccumulateForce::True :
+        GpuBufferOpsAccumulateForce::False;
+
     // Will store the amount of cycles spent waiting for the GPU that
     // will be later used in the DLB accounting.
     float cycles_wait_gpu = 0;
@@ -1335,8 +1368,16 @@ void do_force(FILE *fplog,
             wallcycle_stop(wcycle, ewcFORCE);
         }

+        if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
+        {
+            nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
+        }
         nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
-                                      forceOut.f, wcycle);
+                                      forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+        if (useGpuFBufOps == BufferOpsUseGpu::True)
+        {
+            nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
+        }

         if (fr->nbv->emulateGpu() && (flags & GMX_FORCE_VIRIAL))
         {
@@ -1357,13 +1398,18 @@ void do_force(FILE *fplog,

         if (bDoForces)
         {
+            if (useGpuFBufOps == BufferOpsUseGpu::True)
+            {
+                nbv->wait_stream_gpu(Nbnxm::AtomLocality::NonLocal);
+            }
             dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
         }
     }

     // With both nonbonded and PME offloaded a GPU on the same rank, we use
     // an alternating wait/reduction scheme.
-    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr));
+    bool alternateGpuWait = (!c_disableAlternatingWait && useGpuPme && bUseGPU && !DOMAINDECOMP(cr) &&
+                             (useGpuFBufOps == BufferOpsUseGpu::False));
     if (alternateGpuWait)
     {
         alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &force, &forceOut.forceWithVirial, fr->fshift, enerd,
@@ -1426,6 +1472,25 @@ void do_force(FILE *fplog,
         pme_gpu_reinit_computation(fr->pmedata, wcycle);
     }

+    /* Do the nonbonded GPU (or emulation) force buffer reduction
+     * on the non-alternating path. */
+    if (bUseOrEmulGPU && !alternateGpuWait)
+    {
+
+        if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
+        {
+            nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
+        }
+        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
+                                      forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+        if (useGpuFBufOps == BufferOpsUseGpu::True)
+        {
+            nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
+            nbv->wait_stream_gpu(Nbnxm::AtomLocality::Local);
+        }
+
+    }
+
     if (bUseGPU)
     {
         /* now clear the GPU outputs while we finish the step on the CPU */
@@ -1457,13 +1522,6 @@ void do_force(FILE *fplog,
         wallcycle_stop(wcycle, ewcLAUNCH_GPU);
     }

-    /* Do the nonbonded GPU (or emulation) force buffer reduction
-     * on the non-alternating path. */
-    if (bUseOrEmulGPU && !alternateGpuWait)
-    {
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                      forceOut.f, wcycle);
-    }
     if (DOMAINDECOMP(cr))
     {
         dd_force_flop_stop(cr->dd, nrnb);
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index c1271d8bba..8288a1d227 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1010,6 +1010,7 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
 {
     int gridBegin = 0;
     int gridEnd   = 0;
+
     switch (locality)
     {
         case Nbnxm::AtomLocality::All:
@@ -1437,32 +1438,20 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat,
     }
 }

+
 /* Add the force array(s) from nbnxn_atomdata_t to f */
-void reduceForces(nbnxn_atomdata_t          *nbat,
-                  const Nbnxm::AtomLocality  locality,
-                  const Nbnxm::GridSet      &gridSet,
-                  rvec                      *f)
+template <bool useGpu>
+void reduceForces(nbnxn_atomdata_t             *nbat,
+                  const Nbnxm::AtomLocality     locality,
+                  const Nbnxm::GridSet         &gridSet,
+                  rvec                         *f,
+                  gmx_nbnxn_gpu_t              *gpu_nbv,
+                  GpuBufferOpsAccumulateForce   accumulateForce)
 {
     int a0 = 0;
     int na = 0;
-    switch (locality)
-    {
-        case Nbnxm::AtomLocality::All:
-            a0 = 0;
-            na = gridSet.numRealAtomsTotal();
-            break;
-        case Nbnxm::AtomLocality::Local:
-            a0 = 0;
-            na = gridSet.numRealAtomsLocal();
-            break;
-        case Nbnxm::AtomLocality::NonLocal:
-            a0 = gridSet.numRealAtomsLocal();
-            na = gridSet.numRealAtomsTotal() - gridSet.numRealAtomsLocal();
-            break;
-        case Nbnxm::AtomLocality::Count:
-            GMX_ASSERT(false, "Count is invalid locality specifier");
-            break;
-    }
+
+    nbnxn_get_atom_range(locality, gridSet, &a0, &na);

     if (na == 0)
     {
@@ -1470,41 +1459,67 @@ void reduceForces(nbnxn_atomdata_t *nbat,
         return;
     }

-    int nth = gmx_omp_nthreads_get(emntNonbonded);
+    if (useGpu)
+    {
+        Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
+                                         gpu_nbv,
+                                         a0, na,
+                                         accumulateForce);

-    if (nbat->out.size() > 1)
+    }
+    else
     {
-        if (locality != Nbnxm::AtomLocality::All)
-        {
-            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
-        }
+        int nth = gmx_omp_nthreads_get(emntNonbonded);

-        /* Reduce the force thread output buffers into buffer 0, before adding
-         * them to the, differently ordered, "real" force buffer.
-         */
-        if (nbat->bUseTreeReduce)
+        if (nbat->out.size() > 1)
         {
-            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
-        }
-        else
-        {
-            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
+            if (locality != Nbnxm::AtomLocality::All)
+            {
+                gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
+            }
+
+            /* Reduce the force thread output buffers into buffer 0, before adding
+             * them to the, differently ordered, "real" force buffer.
+             */
+            if (nbat->bUseTreeReduce)
+            {
+                nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
+            }
+            else
+            {
+                nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
+            }
         }
-    }

 #pragma omp parallel for num_threads(nth) schedule(static)
-    for (int th = 0; th < nth; th++)
-    {
-        try
+        for (int th = 0; th < nth; th++)
         {
-            nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
-                                                nbat->out[0],
-                                                a0 + ((th + 0)*na)/nth,
-                                                a0 + ((th + 1)*na)/nth,
-                                                f);
+            try
+            {
+                nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
+                                                    nbat->out[0],
+                                                    a0 + ((th + 0)*na)/nth,
+                                                    a0 + ((th + 1)*na)/nth,
+                                                    f);
+            }
+            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
         }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
 }

+template
+void reduceForces<true>(nbnxn_atomdata_t             *nbat,
+                        const Nbnxm::AtomLocality     locality,
+                        const Nbnxm::GridSet         &gridSet,
+                        rvec                         *f,
+                        gmx_nbnxn_gpu_t              *gpu_nbv,
+                        GpuBufferOpsAccumulateForce   accumulateForce);
+
+template
+void reduceForces<false>(nbnxn_atomdata_t             *nbat,
+                         const Nbnxm::AtomLocality     locality,
+                         const Nbnxm::GridSet         &gridSet,
+                         rvec                         *f,
+                         gmx_nbnxn_gpu_t              *gpu_nbv,
+                         GpuBufferOpsAccumulateForce   accumulateForce);

 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
                                               rvec                   *fshift)
@@ -1524,3 +1539,29 @@ void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
         rvec_inc(fshift[s], sum);
     }
 }
+
+void nbnxn_get_atom_range(const Nbnxm::AtomLocality  atomLocality,
+                          const Nbnxm::GridSet      &gridSet,
+                          int                       *atomStart,
+                          int                       *nAtoms)
+{
+
+    switch (atomLocality)
+    {
+        case Nbnxm::AtomLocality::All:
+            *atomStart = 0;
+            *nAtoms    = gridSet.numRealAtomsTotal();
+            break;
+        case Nbnxm::AtomLocality::Local:
+            *atomStart = 0;
+            *nAtoms    = gridSet.numRealAtomsLocal();
+            break;
+        case Nbnxm::AtomLocality::NonLocal:
+            *atomStart = gridSet.numRealAtomsLocal();
+            *nAtoms    = gridSet.numRealAtomsTotal() - gridSet.numRealAtomsLocal();
+            break;
+        case Nbnxm::AtomLocality::Count:
+            GMX_ASSERT(false, "Count is invalid locality specifier");
+            break;
+    }
+}
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 4e4e0c0a55..d627732014 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -57,12 +57,16 @@ struct nonbonded_verlet_t;
 struct t_mdatoms;
 struct tMPI_Atomic;

+enum class BufferOpsUseGpu;
+
 namespace Nbnxm
 {
 class GridSet;
 enum class KernelType;
 }

+enum class GpuBufferOpsAccumulateForce;
+
 /* Convenience type for vector with aligned memory */
 template <class T>
 using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>;
@@ -335,13 +339,38 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet &,
                                      void *);

 //! Add the computed forces to \p f, an internal reduction might be performed as well
-void reduceForces(nbnxn_atomdata_t     *nbat,
-                  Nbnxm::AtomLocality   locality,
-                  const Nbnxm::GridSet &gridSet,
-                  rvec                 *f);
+template <bool useGpu>
+void reduceForces(nbnxn_atomdata_t             *nbat,
+                  Nbnxm::AtomLocality           locality,
+                  const Nbnxm::GridSet         &gridSet,
+                  rvec                         *f,
+                  gmx_nbnxn_gpu_t              *gpu_nbv,
+                  GpuBufferOpsAccumulateForce   accumulateForce);
+
+
+extern template
+void reduceForces<true>(nbnxn_atomdata_t             *nbat,
+                        const Nbnxm::AtomLocality     locality,
+                        const Nbnxm::GridSet         &gridSet,
+                        rvec                         *f,
+                        gmx_nbnxn_gpu_t              *gpu_nbv,
+                        GpuBufferOpsAccumulateForce   accumulateForce);
+
+extern template
+void reduceForces<false>(nbnxn_atomdata_t             *nbat,
+                         const Nbnxm::AtomLocality     locality,
+                         const Nbnxm::GridSet         &gridSet,
+                         rvec                         *f,
+                         gmx_nbnxn_gpu_t              *gpu_nbv,
+                         GpuBufferOpsAccumulateForce   accumulateForce);

 /* Add the fshift force stored in nbat to fshift */
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
                                               rvec                   *fshift);
+
+/* Get the atom start index and number of atoms for a given locality */
+void nbnxn_get_atom_range(Nbnxm::AtomLocality    atomLocality,
+                          const Nbnxm::GridSet  &gridSet,
+                          int                   *atomStart,
+                          int                   *nAtoms);

 #endif
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
index 172d222148..09d6e37243 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
@@ -36,8 +36,7 @@
 /*! \internal \file
  *
  * \brief
- * CUDA kernel for GPU version of copy_rvec_to_nbat_real.
- * Converts coordinate data from rvec to nb format.
+ * CUDA kernels for GPU versions of copy_rvec_to_nbat_real and add_nbat_f_to_f.
  *
  * \author Alan Gray
  * \author Jon Vincent
@@ -137,3 +136,51 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColu
     }
 }
+
+/*! \brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f
+ *
+ * \param[in]     fnb       Force in nbat format
+ * \param[in,out] f         Force buffer to be reduced into
+ * \param[in]     cell      Cell index mapping
+ * \param[in]     atomStart Start atom index
+ * \param[in]     nAtoms    Number of atoms
+ */
+template <bool accumulateForce>
+__global__ void
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  fnb,
+                                 rvec                       *f,
+                                 const int *__restrict__     cell,
+                                 const int                   atomStart,
+                                 const int                   nAtoms);
+template <bool accumulateForce>
+__global__ void
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  fnb,
+                                 rvec                       *f,
+                                 const int *__restrict__     cell,
+                                 const int                   atomStart,
+                                 const int                   nAtoms)
+{
+
+    /* map particle-level parallelism to 1D CUDA thread and block index */
+    int threadIndex = blockIdx.x*blockDim.x + threadIdx.x;
+
+    /* perform addition for each particle */
+    if (threadIndex < nAtoms)
+    {
+
+        int     i      = cell[atomStart + threadIndex];
+        float3 *f_dest = (float3 *)&f[atomStart + threadIndex][XX];
+
+        if (accumulateForce)
+        {
+            *f_dest += fnb[i];
+        }
+        else
+        {
+            *f_dest = fnb[i];
+        }
+
+    }
+    return;
+}
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index e8e6c5b5cc..e48713c7ce 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -54,6 +54,7 @@
 #include "nbnxm_cuda.h"

 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/mdlib/force_flags.h"
 #include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/gpu_common.h"
@@ -113,6 +114,10 @@
 namespace Nbnxm
 {

+//! Number of CUDA threads in a block
+//TODO Optimize this through experimentation
+constexpr static int c_bufOpsThreadsPerBlock = 128;
+
 /*! Nonbonded kernel function pointer type */
 typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t,
                                      const cu_nbparam_t,
@@ -640,7 +645,8 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t   *nb,
                         nbnxn_atomdata_t   *nbatom,
                         const int           flags,
-                        const AtomLocality  atomLocality)
+                        const AtomLocality  atomLocality,
+                        const bool          copyBackNbForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
@@ -682,8 +688,11 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
     }

     /* DtoH f */
-    cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
-                      (adat_len)*sizeof(*adat->f), stream);
+    if (copyBackNbForce)
+    {
+        cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
+                          (adat_len)*sizeof(*adat->f), stream);
+    }

     /* After the non-local D2H is launched the nonlocal_done event can be
        recorded which signals that the local D2H can proceed. This event is not
@@ -805,13 +814,12 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
     }

     /* launch kernel on GPU */
-    const int threadsPerBlock = 128;

     KernelLaunchConfig config;
-    config.blockSize[0] = threadsPerBlock;
+    config.blockSize[0] = c_bufOpsThreadsPerBlock;
     config.blockSize[1] = 1;
     config.blockSize[2] = 1;
-    config.gridSize[0]  = (grid.numCellsColumnMax()*numAtomsPerCell + threadsPerBlock - 1)/threadsPerBlock;
+    config.gridSize[0]  = (grid.numCellsColumnMax()*numAtomsPerCell + c_bufOpsThreadsPerBlock - 1)/c_bufOpsThreadsPerBlock;
     config.gridSize[1]  = numColumns;
     config.gridSize[2]  = 1;
     GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this");
@@ -838,4 +846,126 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
     nbnxnInsertNonlocalGpuDependency(nb, interactionLoc);
 }

+/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality           atomLocality,
+                               gmx_nbnxn_gpu_t             *nb,
+                               int                          atomStart,
+                               int                          nAtoms,
+                               GpuBufferOpsAccumulateForce  accumulateForce)
+{
+
+    cu_atomdata_t *adat   = nb->atdat;
+    cudaStream_t   stream = atomLocality == AtomLocality::Local ?
+        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+
+    /* launch kernel */
+
+    KernelLaunchConfig config;
+    config.blockSize[0]     = c_bufOpsThreadsPerBlock;
+    config.blockSize[1]     = 1;
+    config.blockSize[2]     = 1;
+    config.gridSize[0]      = ((nAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
+    config.gridSize[1]      = 1;
+    config.gridSize[2]      = 1;
+    config.sharedMemorySize = 0;
+    config.stream           = stream;
+
+    auto          kernelFn = (accumulateForce == GpuBufferOpsAccumulateForce::True) ?
+        nbnxn_gpu_add_nbat_f_to_f_kernel<true> : nbnxn_gpu_add_nbat_f_to_f_kernel<false>;
+    const float3 *fPtr     = adat->f;
+    rvec         *frvec    = nb->frvec;
+    const int    *cell     = nb->cell;
+
+    const auto    kernelArgs = prepareGpuKernelArguments(kernelFn, config,
+                                                         &fPtr,
+                                                         &frvec,
+                                                         &cell,
+                                                         &atomStart,
+                                                         &nAtoms);
+
+    launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
+
+}
+
+void nbnxn_launch_copy_f_to_gpu(const AtomLocality     atomLocality,
+                                const Nbnxm::GridSet  &gridSet,
+                                gmx_nbnxn_gpu_t       *nb,
+                                rvec                  *f)
+{
+    cudaStream_t stream = atomLocality == AtomLocality::Local ?
+        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+    bool         bDoTime = nb->bDoTime;
+    cu_timers_t *t       = nb->timers;
+
+    int          atomStart = 0, nAtoms = 0;
+
+    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
+    }
+
+    rvec *ptrDest = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
+    rvec *ptrSrc  = reinterpret_cast<rvec *> (f[atomStart]);
+    //copyToDeviceBuffer(&ptrDest, ptrSrc, 0, nAtoms,
+    //                   stream, GpuApiCallBehavior::Async, nullptr);
+    //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
+    cudaMemcpyAsync(ptrDest, ptrSrc, nAtoms*sizeof(rvec), cudaMemcpyHostToDevice,
+                    stream);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
+    }
+
+    return;
+}
+
+void nbnxn_launch_copy_f_from_gpu(const AtomLocality     atomLocality,
+                                  const Nbnxm::GridSet  &gridSet,
+                                  gmx_nbnxn_gpu_t       *nb,
+                                  rvec                  *f)
+{
+    cudaStream_t stream = atomLocality == AtomLocality::Local ?
+        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+    bool         bDoTime = nb->bDoTime;
+    cu_timers_t *t       = nb->timers;
+
+    int          atomStart = 0, nAtoms = 0;
+
+    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
+    }
+
+    rvec *ptrDest = reinterpret_cast<rvec *> (f[atomStart]);
+    rvec *ptrSrc  = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
+    //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
+    //                     stream, GpuApiCallBehavior::Async, nullptr);
+    //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
+    cudaMemcpyAsync(ptrDest, ptrSrc, nAtoms*sizeof(rvec), cudaMemcpyDeviceToHost,
+                    stream);
+
+    if (bDoTime)
+    {
+        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
+    }
+
+    return;
+}
+
+void nbnxn_wait_stream_gpu(const AtomLocality gmx_unused  atomLocality,
+                           gmx_nbnxn_gpu_t               *nb)
+{
+
+    cudaStream_t stream = atomLocality == AtomLocality::Local ?
+        nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+
+    cudaStreamSynchronize(stream);
+
+}
+
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 6b0845219c..a69f758a27 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -61,6 +61,7 @@
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/nbnxm/gridset.h"
 #include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
 #include "gromacs/nbnxm/pairlistsets.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
@@ -503,6 +504,10 @@ gpu_init(const gmx_device_info_t *deviceInfo,
     nb->ncxy_na_alloc  = 0;
     nb->ncxy_ind       = 0;
     nb->ncxy_ind_alloc = 0;
+    nb->nfrvec         = 0;
+    nb->nfrvec_alloc   = 0;
+    nb->ncell          = 0;
+    nb->ncell_alloc    = 0;

     if (debug)
     {
@@ -956,4 +961,23 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet &gridSet,
     return;
 }

+/* Initialization for F buffer operations on GPU. */
+void nbnxn_gpu_init_add_nbat_f_to_f(const int        *cell,
+                                    gmx_nbnxn_gpu_t  *gpu_nbv,
+                                    int               natoms_total)
+{
+
+    cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
+
+    reallocateDeviceBuffer(&gpu_nbv->frvec, natoms_total, &gpu_nbv->nfrvec, &gpu_nbv->nfrvec_alloc, nullptr);
+
+    if (natoms_total > 0)
+    {
+        reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
+        copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
+    }
+
+    return;
+}
+
 } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
index ff8705df59..c18cd3f9a2 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -223,7 +223,19 @@ struct gmx_nbnxn_cuda_t
     int      natoms;
     //! number of atoms allocated in device buffer
     int      natoms_alloc;
-    //! x buf ops input buffer index mapping
+    //! force in rvec format
+    rvec    *frvec;
+    //! number of atoms in force buffer
+    int      nfrvec;
+    //! number of atoms allocated in force buffer
+    int      nfrvec_alloc;
+    //! f buf ops cell index mapping
+    int     *cell;
+    //! number of indices in cell buffer
+    int      ncell;
+    //! number of indices allocated in cell buffer
+    int      ncell_alloc;
+    //! array of atom indices
     int     *atomIndices;
     //! size of atom indices
     int      atomIndicesSize;
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index a81a6b400e..f7208272b5 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -135,14 +135,14 @@ void nonbonded_verlet_t::setAtomProperties(const t_mdatoms &mdatoms,
 void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality        locality,
                                         const bool                       fillLocal,
                                         gmx::ArrayRef<const gmx::RVec>   x,
-                                        bool                             useGpu,
+                                        BufferOpsUseGpu                  useGpu,
                                         void                            *xPmeDevicePtr,
                                         gmx_wallcycle                   *wcycle)
 {
     wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);

-    auto fnPtr = useGpu ?
+    auto fnPtr = (useGpu == BufferOpsUseGpu::True) ?
         nbnxn_atomdata_copy_x_to_nbat_x<true> :
         nbnxn_atomdata_copy_x_to_nbat_x<false>;
@@ -160,10 +160,17 @@ gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
 }

 void
-nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality  locality,
-                                             rvec                      *f,
-                                             gmx_wallcycle             *wcycle)
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality    locality,
+                                             rvec                        *f,
+                                             BufferOpsUseGpu              useGpu,
+                                             GpuBufferOpsAccumulateForce  accumulateForce,
+                                             gmx_wallcycle               *wcycle)
 {
+
+    GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) &&
+                 (accumulateForce == GpuBufferOpsAccumulateForce::True)),
+               "Accumulation of force is only valid when GPU buffer ops are active");
+
     /* Skip the reduction if there was no short-range GPU work to do
      * (either NB or both NB and bonded work). */
     if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
@@ -174,7 +181,25 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality
     wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);

-    reduceForces(nbat.get(), locality, pairSearch_->gridSet(), f);
+    auto fn = useGpu == BufferOpsUseGpu::True ?
+        reduceForces<true> : reduceForces<false>;
+    fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce);

     wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 }

+void
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle)
+{
+
+    wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+
+    const Nbnxm::GridSet &gridSet = pairSearch_->gridSet();
+
+    Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(),
+                                          gpu_nbv,
+                                          gridSet.numRealAtomsTotal());

     wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
 }
@@ -206,4 +231,26 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLoc
 {
     Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
 }
+
+void nonbonded_verlet_t::launch_copy_f_to_gpu(rvec *f, const Nbnxm::AtomLocality locality)
+{
+    nbnxn_launch_copy_f_to_gpu(locality,
+                               pairSearch_->gridSet(),
+                               gpu_nbv,
+                               f);
+}
+
+void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocality locality)
+{
+    nbnxn_launch_copy_f_from_gpu(locality,
+                                 pairSearch_->gridSet(),
+                                 gpu_nbv,
+                                 f);
+}
+
+void nonbonded_verlet_t::wait_stream_gpu(const Nbnxm::AtomLocality locality)
+{
+    nbnxn_wait_stream_gpu(locality, gpu_nbv);
+}
+
 /*! \endcond */
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 7e3869b451..1e177a57a0 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -128,6 +128,22 @@ struct t_nrnb;
 struct t_forcerec;
 struct t_inputrec;

+/*! \brief Switch for whether to use GPU for buffer ops */
+enum class BufferOpsUseGpu
+{
+    True,
+    False
+};
+
+/*! \brief Switch for whether forces should accumulate in GPU buffer ops */
+enum class GpuBufferOpsAccumulateForce
+{
+    True,  // Forces should be accumulated and format-converted
+    False, // Forces should not be accumulated, just format-converted
+    Null   // GPU buffer ops are not in use, so this value is not applicable
+};
+
+
 namespace gmx
 {
 class MDLogger;
@@ -244,7 +260,7 @@ struct nonbonded_verlet_t
         void setCoordinates(Nbnxm::AtomLocality              locality,
                             bool                             fillLocal,
                             gmx::ArrayRef<const gmx::RVec>   x,
-                            bool                             useGpu,
+                            BufferOpsUseGpu                  useGpu,
                             void                            *xPmeDevicePtr,
                             gmx_wallcycle                   *wcycle);
@@ -297,9 +313,23 @@ struct nonbonded_verlet_t
                                  gmx_wallcycle              *wcycle);

         //! Add the forces stored in nbat to f, zeros the forces in nbat
-        void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality  locality,
-                                      rvec                *f,
-                                      gmx_wallcycle       *wcycle);
+        void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality          locality,
+                                      rvec                        *f,
+                                      BufferOpsUseGpu              useGpu,
+                                      GpuBufferOpsAccumulateForce  accumulateForce,
+                                      gmx_wallcycle               *wcycle);
+
+        /*! \brief Perform initialization for F buffer operations on GPU. */
+        void atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle);
+
+        /*! \brief H2D transfer of force buffer */
+        void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
+        /*! \brief D2H transfer of force buffer */
+        void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
+
+        /*! \brief Host sync on device stream given by locality */
+        void wait_stream_gpu(Nbnxm::AtomLocality locality);

         //! Return the kernel setup
         const Nbnxm::KernelSetup &kernelSetup() const
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index a1e4800456..3295fdb1f9 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -54,6 +54,7 @@ struct nbnxn_atomdata_t;

 enum class GpuTaskCompletion;
+enum class GpuBufferOpsAccumulateForce;

 namespace gmx
 {
@@ -143,7 +144,8 @@ GPU_FUNC_QUALIFIER
 void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
                         nbnxn_atomdata_t gmx_unused *nbatom,
                         int              gmx_unused  flags,
-                        AtomLocality     gmx_unused  aloc) GPU_FUNC_TERM
+                        AtomLocality     gmx_unused  aloc,
+                        const bool       gmx_unused  copyBackNbForce) GPU_FUNC_TERM

 /*! \brief Attempts to complete nonbonded GPU task.
  *
@@ -268,7 +270,40 @@ GPU_FUNC_QUALIFIER
 bool haveGpuShortRangeWork(const gmx_nbnxn_gpu_t gmx_unused *nb,
                            const Nbnxm::AtomLocality gmx_unused aLocality) GPU_FUNC_TERM_WITH_RETURN(false)

+/*! \brief Initialization for F buffer operations on GPU */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell,
+                                    gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+                                    int gmx_unused natoms_total) CUDA_FUNC_TERM
+
+/*! \brief F buffer operations on GPU: reduces forces in nbat format into the rvec-format buffer. */
+CUDA_FUNC_QUALIFIER
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality gmx_unused atomLocality,
+                               gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+                               int gmx_unused atomStart,
+                               int gmx_unused nAtoms,
+                               GpuBufferOpsAccumulateForce gmx_unused accumulateForce) CUDA_FUNC_TERM
+
+/*! \brief Copy force buffer from CPU to GPU */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_f_to_gpu(const AtomLocality gmx_unused atomLocality,
+                                const Nbnxm::GridSet gmx_unused &gridSet,
+                                gmx_nbnxn_gpu_t gmx_unused *nb,
+                                rvec gmx_unused *f) CUDA_FUNC_TERM
+
+/*! \brief Copy force buffer from GPU to CPU */
+CUDA_FUNC_QUALIFIER
+void nbnxn_launch_copy_f_from_gpu(const AtomLocality gmx_unused atomLocality,
+                                  const Nbnxm::GridSet gmx_unused &gridSet,
+                                  gmx_nbnxn_gpu_t gmx_unused *nb,
+                                  rvec gmx_unused *f) CUDA_FUNC_TERM
+
+/*! \brief Wait for GPU stream to complete */
+CUDA_FUNC_QUALIFIER
+void nbnxn_wait_stream_gpu(const AtomLocality gmx_unused atomLocality,
+                           gmx_nbnxn_gpu_t gmx_unused *nb) CUDA_FUNC_TERM
+

-} // namespace Nbnxm
+}      // namespace Nbnxm

 #endif
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index 659c484287..3ac8491795 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -731,10 +731,11 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
  * Launch asynchronously the download of nonbonded forces from the GPU
  * (and energies/shift forces if required).
  */
-void gpu_launch_cpyback(gmx_nbnxn_ocl_t         *nb,
-                        struct nbnxn_atomdata_t *nbatom,
-                        const int                flags,
-                        const AtomLocality       aloc)
+void gpu_launch_cpyback(gmx_nbnxn_ocl_t                    *nb,
+                        struct nbnxn_atomdata_t            *nbatom,
+                        const int                           flags,
+                        const AtomLocality                  aloc,
+                        const bool               gmx_unused  copyBackNbForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
--
2.22.0
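
Note for reviewers (below the version trailer, so not applied by git am): the
snippet that follows is a condensed sketch of the per-locality force
buffer-ops sequence this patch adds to do_force(), shown for the local atoms
only. All names (nbv, forceOut, wcycle, useGpuFBufOps, haveCpuForces,
accumulateForce) come from the patch above; the surrounding control flow
(bUseOrEmulGPU, alternateGpuWait, domain-decomposition branches) is
simplified away.

    // H2D copy of CPU force contributions (special/listed forces or the
    // CPU PME reduction output) is only needed when such forces exist to
    // accumulate into; otherwise the kernel just format-converts.
    if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
    {
        nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
    }
    // Reduce nbat-format nonbonded forces into the rvec-format buffer; on
    // the GPU path this launches nbnxn_gpu_add_nbat_f_to_f_kernel on the
    // local stream, with accumulateForce selecting += versus plain assignment.
    nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, forceOut.f,
                                  useGpuFBufOps, accumulateForce, wcycle);
    if (useGpuFBufOps == BufferOpsUseGpu::True)
    {
        // D2H copy of the reduced forces, then block until the stream drains.
        nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
        nbv->wait_stream_gpu(Nbnxm::AtomLocality::Local);
    }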