From: Artem Zhmurov
Date: Wed, 11 Sep 2019 12:19:31 +0000 (+0200)
Subject: Decouple GPU force buffer management from buffer ops in NBNXM
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=873bf0800f52e3d6c5998dae27e64f1f6e18235a;p=alexxy%2Fgromacs.git

Decouple GPU force buffer management from buffer ops in NBNXM

When GPU-side buffer operations are used, the total forces on the
device are accumulated by the NBNXM module in its local GPU buffer.
By decoupling the CPU and GPU buffer operations and making the force
buffer an argument of the reduction function, this commit allows the
responsibility for GPU force management to be moved from the NBNXM
module to a third-party object. This commit is a refactoring of the
code in preparation for the introduction of the GPU-side
PropagatorStateData object.

TODO: Use DeviceBuffer when passing the PME GPU forces buffer.

Refs. #2816

Change-Id: I2a1f9d12fad3fb5b2ce37ca3ed3d0cb91777c468
---

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 853f97b021..768ca45777 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1455,14 +1455,22 @@ void do_force(FILE *fplog,
         // flag to specify if forces should be accumulated in force buffer
         // ops. For non-local part, this just depends on whether CPU forces are present.
         bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && haveCpuForces;
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
-                                      forceWithShiftForces.force(), pme_gpu_get_device_f(fr->pmedata),
-                                      pme_gpu_get_f_ready_synchronizer(fr->pmedata),
-                                      useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+        if (useGpuFBufOps == BufferOpsUseGpu::True)
+        {
+            nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
+                                              nbv->getDeviceForces(),
+                                              pme_gpu_get_device_f(fr->pmedata),
+                                              pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                              useGpuFPmeReduction, accumulateForce);
             nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::NonLocal);
         }
+        else
+        {
+            nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
+                                          forceWithShiftForces.force());
+        }
+
         if (fr->nbv->emulateGpu() && forceFlags.computeVirial)
         {
@@ -1574,15 +1582,22 @@ void do_force(FILE *fplog,
         // non-local part).
         bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && (haveCpuForces || DOMAINDECOMP(cr));
-        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
-                                      force, pme_gpu_get_device_f(fr->pmedata),
-                                      pme_gpu_get_f_ready_synchronizer(fr->pmedata),
-                                      useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+        if (useGpuFBufOps == BufferOpsUseGpu::True)
+        {
+            nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
+                                              nbv->getDeviceForces(),
+                                              pme_gpu_get_device_f(fr->pmedata),
+                                              pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+                                              useGpuFPmeReduction, accumulateForce);
             nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local);
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
         }
+        else
+        {
+            nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, force);
+        }
+
     }

     launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd,

diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index 6424b1558e..0561462633 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1472,16 +1472,10 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat,

 /* Add the force array(s) from nbnxn_atomdata_t to f */
-template <bool useGpu>
 void reduceForces(nbnxn_atomdata_t          *nbat,
                   const Nbnxm::AtomLocality  locality,
                   const Nbnxm::GridSet      &gridSet,
-                  rvec                      *f,
-                  void                      *pmeFDeviceBuffer,
-                  GpuEventSynchronizer      *pmeForcesReady,
-                  gmx_nbnxn_gpu_t           *gpu_nbv,
-                  bool                       useGpuFPmeReduction,
-                  bool                       accumulateForce)
+                  rvec                      *f)
 {
     int a0 = 0;
     int na = 0;
@@ -1494,75 +1488,77 @@ void reduceForces(nbnxn_atomdata_t *nbat,
         return;
     }

-    if (useGpu)
-    {
-        Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
-                                         gpu_nbv,
-                                         pmeFDeviceBuffer,
-                                         pmeForcesReady,
-                                         a0, na,
-                                         useGpuFPmeReduction,
-                                         accumulateForce);
-    }
-    else
-    {
-        int nth = gmx_omp_nthreads_get(emntNonbonded);
+    int nth = gmx_omp_nthreads_get(emntNonbonded);

-        if (nbat->out.size() > 1)
+    if (nbat->out.size() > 1)
+    {
+        if (locality != Nbnxm::AtomLocality::All)
         {
-            if (locality != Nbnxm::AtomLocality::All)
-            {
-                gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
-            }
+            gmx_incons("add_f_to_f called with nout>1 and locality!=eatAll");
+        }

+        /* Reduce the force thread output buffers into buffer 0, before adding
+         * them to the, differently ordered, "real" force buffer.
+         */
+        if (nbat->bUseTreeReduce)
+        {
+            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
+        }
+        else
+        {
+            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
+        }
+    }

 #pragma omp parallel for num_threads(nth) schedule(static)
-        for (int th = 0; th < nth; th++)
+    for (int th = 0; th < nth; th++)
+    {
+        try
         {
-            try
-            {
-                nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
-                                                    nbat->out[0],
-                                                    a0 + ((th + 0)*na)/nth,
-                                                    a0 + ((th + 1)*na)/nth,
-                                                    f);
-            }
-            GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+            nbnxn_atomdata_add_nbat_f_to_f_part(gridSet, *nbat,
+                                                nbat->out[0],
+                                                a0 + ((th + 0)*na)/nth,
+                                                a0 + ((th + 1)*na)/nth,
+                                                f);
         }
+        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+    }
+}
+
+/* Add the force array(s) from nbnxn_atomdata_t to the total GPU force buffer */
+void reduceForcesGpu(const Nbnxm::AtomLocality  locality,
+                     DeviceBuffer<float>        totalForcesDevice,
+                     const Nbnxm::GridSet      &gridSet,
+                     void                      *pmeForcesDevice,
+                     GpuEventSynchronizer      *pmeForcesReady,
+                     gmx_nbnxn_gpu_t           *gpu_nbv,
+                     bool                       useGpuFPmeReduction,
+                     bool                       accumulateForce)
+{
+    int atomsStart = 0;
+    int numAtoms   = 0;
+
+    nbnxn_get_atom_range(locality, gridSet, &atomsStart, &numAtoms);
+
+    if (numAtoms == 0)
+    {
+        /* There are no atoms for this reduction, avoid some overhead */
+        return;
+    }
+
+    Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
+                                     totalForcesDevice,
+                                     gpu_nbv,
+                                     pmeForcesDevice,
+                                     pmeForcesReady,
+                                     atomsStart, numAtoms,
+                                     useGpuFPmeReduction,
+                                     accumulateForce);
+}
+
+DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv)
+{
+    return Nbnxm::nbnxn_gpu_get_f_gpu(gpu_nbv);
+}

-template
-void reduceForces<true>(nbnxn_atomdata_t          *nbat,
-                        const Nbnxm::AtomLocality  locality,
-                        const Nbnxm::GridSet      &gridSet,
-                        rvec                      *f,
-                        void                      *fpme,
-                        GpuEventSynchronizer      *pmeForcesReady,
-                        gmx_nbnxn_gpu_t           *gpu_nbv,
-                        bool                       useGpuFPmeReduction,
-                        bool                       accumulateForce);
-
-template
-void reduceForces<false>(nbnxn_atomdata_t          *nbat,
-                         const Nbnxm::AtomLocality  locality,
-                         const Nbnxm::GridSet      &gridSet,
-                         rvec                      *f,
-                         void                      *fpme,
-                         GpuEventSynchronizer      *pmeForcesReady,
-                         gmx_nbnxn_gpu_t           *gpu_nbv,
-                         bool                       useGpuFPmeReduction,
-                         bool                       accumulateForce);

 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
                                               gmx::ArrayRef<gmx::RVec>  fshift)

diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 488290bb5b..ba644bd436 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -367,40 +367,47 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet &gridSet,
                                     gmx_nbnxn_gpu_t      *gpu_nbv,
                                     DeviceBuffer<float>   coordinatesDevice);

-//! Add the computed forces to \p f, an internal reduction might be performed as well
-template <bool useGpu>
+/*! \brief Add the computed forces to \p f; an internal reduction might be performed as well
+ *
+ * \param[in]  nbat      Atom data in NBNXM format.
+ * \param[in]  locality  Whether the reduction should be performed on local or non-local atoms.
+ * \param[in]  gridSet   The grids data.
+ * \param[out] totalForce  Buffer to accumulate the resulting force.
+ */
 void reduceForces(nbnxn_atomdata_t          *nbat,
                   Nbnxm::AtomLocality        locality,
                   const Nbnxm::GridSet      &gridSet,
-                  rvec                      *f,
-                  void                      *pmeFDeviceBuffer,
-                  GpuEventSynchronizer      *pmeForcesReady,
-                  gmx_nbnxn_gpu_t           *gpu_nbv,
-                  bool                       useGpuFPmeReduction,
-                  bool                       accumulateForce);
-
-
-extern template
-void reduceForces<true>(nbnxn_atomdata_t          *nbat,
-                        const Nbnxm::AtomLocality  locality,
-                        const Nbnxm::GridSet      &gridSet,
-                        rvec                      *f,
-                        void                      *pmeFDeviceBuffer,
-                        GpuEventSynchronizer      *pmeForcesReady,
-                        gmx_nbnxn_gpu_t           *gpu_nbv,
-                        bool                       useGpuFPmeReduction,
-                        bool                       accumulateForce);
-
-extern template
-void reduceForces<false>(nbnxn_atomdata_t          *nbat,
-                         const Nbnxm::AtomLocality  locality,
-                         const Nbnxm::GridSet      &gridSet,
-                         rvec                      *f,
-                         void                      *pmeFDeviceBuffer,
-                         GpuEventSynchronizer      *pmeForcesReady,
-                         gmx_nbnxn_gpu_t           *gpu_nbv,
-                         bool                       useGpuFPmeReduction,
-                         bool                       accumulateForce);
+                  rvec                      *totalForce);
+
+/*! \brief Reduce forces on the GPU
+ *
+ * \param[in]  locality             Whether the reduction should be performed on local or non-local atoms.
+ * \param[out] totalForcesDevice    Device buffer to accumulate the resulting force.
+ * \param[in]  gridSet              The grids data.
+ * \param[in]  pmeForcesDevice      Device buffer with PME forces.
+ * \param[in]  pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]  gpu_nbv              The NBNXM GPU data structure.
+ * \param[in]  useGpuFPmeReduction  Whether PME forces should be added.
+ * \param[in]  accumulateForce      Whether the total force buffer already contains useful data.
+ */
+void reduceForcesGpu(Nbnxm::AtomLocality        locality,
+                     DeviceBuffer<float>        totalForcesDevice,
+                     const Nbnxm::GridSet      &gridSet,
+                     void                      *pmeForcesDevice,
+                     GpuEventSynchronizer      *pmeForcesReady,
+                     gmx_nbnxn_gpu_t           *gpu_nbv,
+                     bool                       useGpuFPmeReduction,
+                     bool                       accumulateForce);
+
+/*! \brief Getter for the GPU forces buffer
+ *
+ * \todo Will be removed when the buffer management is lifted out of the NBNXM module.
+ *
+ * \param[in] gpu_nbv  The NBNXM GPU data structure.
+ *
+ * \returns Device forces buffer.
+ */
+DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv);

 /* Add the fshift force stored in nbat to fshift */
 void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat,

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
index bc0d2c2c5e..3c8f7b1cb3 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh
@@ -137,56 +137,59 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColu
 }

-/*! \brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f
+/*! \brief CUDA kernel to sum up the force components
  *
- * \param[in]     fnb              Force in nbat format
- * \param[in]     fPmeDeviceBuffer PME force
- * \param[in,out] f                Force buffer to be reduced into
- * \param[in]     cell             Cell index mapping
- * \param[in]     atomStart        Start atom index
- * \param[in]     nAtoms           Number of Atoms
+ * \tparam        accumulateForce  Whether the initial forces in \p d_fTotal should be kept (accumulated into) rather than overwritten.
+ * \tparam        addPmeForce      Whether the PME force should be added to the total.
+ *
+ * \param[in]     d_fNB            Non-bonded forces in nbat format.
+ * \param[in]     d_fPme           PME forces.
+ * \param[in,out] d_fTotal         Force buffer to be reduced into.
+ * \param[in]     d_cell           Cell index mapping.
+ * \param[in]     atomStart        Start atom index.
+ * \param[in]     numAtoms         Number of atoms.
  */
-template <bool accumulateForce, bool addPmeF>
+template <bool accumulateForce, bool addPmeForce>
 __global__ void
-nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  fnb,
-                                 const float3 *__restrict__  fPmeDeviceBuffer,
-                                 float3                     *f,
-                                 const int *__restrict__     cell,
-                                 const int                   atomStart,
-                                 const int                   nAtoms);
-template <bool accumulateForce, bool addPmeF>
+template <bool accumulateForce, bool addPmeForce>
 __global__ void
-nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  fnb,
-                                 const float3 *__restrict__  fPmeDeviceBuffer,
-                                 float3                     *f,
-                                 const int *__restrict__     cell,
-                                 const int                   atomStart,
-                                 const int                   nAtoms)
+nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__  d_fNB,
+                                 const float3 *__restrict__  d_fPme,
+                                 float3                     *d_fTotal,
+                                 const int *__restrict__     d_cell,
+                                 const int                   atomStart,
+                                 const int                   numAtoms)
 {
     /* map particle-level parallelism to 1D CUDA thread and block index */
     int threadIndex = blockIdx.x*blockDim.x+threadIdx.x;

     /* perform addition for each particle */
-    if (threadIndex < nAtoms)
+    if (threadIndex < numAtoms)
     {
-        int     i     = cell[atomStart+threadIndex];
-        float3 *fDest = (float3 *)&f[atomStart+threadIndex];
+        int     i     = d_cell[atomStart+threadIndex];
+        float3 *fDest = (float3 *)&d_fTotal[atomStart+threadIndex];
         float3  temp;

         if (accumulateForce)
         {
             temp  = *fDest;
-            temp += fnb[i];
+            temp += d_fNB[i];
         }
         else
         {
-            temp = fnb[i];
+            temp = d_fNB[i];
         }
-        if (addPmeF)
+        if (addPmeForce)
         {
-            temp += fPmeDeviceBuffer[atomStart+threadIndex];
+            temp += d_fPme[atomStart+threadIndex];
         }
         *fDest = temp;

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 63051b7c43..6a17ee6a9b 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -859,22 +859,22 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid,
 /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
 void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality     atomLocality,
+                               DeviceBuffer<float>    totalForcesDevice,
                                gmx_nbnxn_gpu_t       *nb,
-                               void                  *fPmeDevicePtr,
+                               void                  *pmeForcesDevice,
                                GpuEventSynchronizer  *pmeForcesReady,
                                int                    atomStart,
-                               int                    nAtoms,
+                               int                    numAtoms,
                                bool                   useGpuFPmeReduction,
                                bool                   accumulateForce)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-    cu_atomdata_t            *adat      = nb->atdat;
-    bool                      addPmeF   = useGpuFPmeReduction;
+    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+    cudaStream_t              stream    = nb->stream[iLocality];
+    cu_atomdata_t            *adat      = nb->atdat;

-    if (addPmeF)
+    if (useGpuFPmeReduction)
     {
         //Stream must wait for PME force completion
         pmeForcesReady->enqueueWaitEvent(stream);
@@ -886,7 +886,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
     config.blockSize[0]     = c_bufOpsThreadsPerBlock;
     config.blockSize[1]     = 1;
     config.blockSize[2]     = 1;
-    config.gridSize[0]      = ((nAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
+    config.gridSize[0]      = ((numAtoms+1)+c_bufOpsThreadsPerBlock-1)/c_bufOpsThreadsPerBlock;
     config.gridSize[1]      = 1;
     config.gridSize[2]      = 1;
     config.sharedMemorySize = 0;
@@ -896,30 +896,35 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
                      nbnxn_gpu_add_nbat_f_to_f_kernel<true, false> :
                      nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;

-    if (addPmeF)
+    if (useGpuFPmeReduction)
     {
         kernelFn = accumulateForce ?
                    nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> :
                    nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>;
     }

-    const float3 *d_f      = adat->f;
-    float3       *d_fNB    = (float3*) nb->frvec;
-    const float3 *d_fPme   = (float3*) fPmeDevicePtr;
-    const int    *d_cell   = nb->cell;
+    const float3 *d_fNB    = adat->f;
+    const float3 *d_fPme   = (float3*) pmeForcesDevice;
+    float3       *d_fTotal = (float3*) totalForcesDevice;
+    const int    *d_cell   = nb->cell;

     const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
-                                                      &d_f,
-                                                      &d_fPme,
                                                       &d_fNB,
+                                                      &d_fPme,
+                                                      &d_fTotal,
                                                       &d_cell,
                                                       &atomStart,
-                                                      &nAtoms);
+                                                      &numAtoms);

     launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
 }

+DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t *nb)
+{
+    return reinterpret_cast<DeviceBuffer<float>>(nb->frvec);
+}
+
 void nbnxn_launch_copy_f_to_gpu(const AtomLocality     atomLocality,
                                 const Nbnxm::GridSet  &gridSet,
                                 gmx_nbnxn_gpu_t       *nb,

diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 794b184f36..68eac205a4 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -203,26 +203,22 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);

-    reduceForces<false>(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()), nullptr, nullptr, gpu_nbv, false, false);
+    reduceForces(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()));

     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }

 void
-nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality  locality,
-                                             gmx::ArrayRef<gmx::RVec>   force,
-                                             void                      *fPmeDeviceBuffer,
-                                             GpuEventSynchronizer      *pmeForcesReady,
-                                             BufferOpsUseGpu            useGpu,
-                                             bool                       useGpuFPmeReduction,
-                                             bool                       accumulateForce)
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality  locality,
+                                                 DeviceBuffer<float>        totalForcesDevice,
+                                                 void                      *forcesPmeDevice,
+                                                 GpuEventSynchronizer      *pmeForcesReady,
+                                                 bool                       useGpuFPmeReduction,
+                                                 bool                       accumulateForce)
 {
-    GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && accumulateForce),
-               "Accumulatation of force is only valid when GPU buffer ops are active");
-
-    GMX_ASSERT((useGpuFPmeReduction == (fPmeDeviceBuffer != nullptr)),
+    GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)),
                "GPU PME force reduction is only valid when a non-null GPU PME force pointer is available");

     /* Skip the reduction if there was no short-range GPU work to do
@@ -235,8 +231,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality
     wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
     wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);

-    auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
-    fn(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()), fPmeDeviceBuffer, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);
+    reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);

     wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
@@ -259,6 +254,11 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
     wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
 }

+DeviceBuffer<float> nonbonded_verlet_t::getDeviceForces()
+{
+    return nbnxn_atomdata_get_f_gpu(gpu_nbv);
+}
+
 real nonbonded_verlet_t::pairlistInnerRadius() const
 {
     return pairlistSets_->params().rlistInner;

diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h
index 50ead8c655..db550859df 100644
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -354,22 +354,30 @@ struct nonbonded_verlet_t
     void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality       locality,
                                   gmx::ArrayRef<gmx::RVec>  force);

-    /*! \brief Add the forces stored in nbat to f, allowing for possibility that GPU buffer ops are active
-     * \param [in]    locality             Local or non-local
-     * \param [inout] force                Force to be added to
-     * \param [in]    fPme                 Force from PME calculation
-     * \param [in]    pmeForcesReady       Event triggered when PME force calculation has completed
-     * \param [in]    useGpu               Whether GPU buffer ops are active
-     * \param [in]    useGpuFPmeReduction  Whether PME force reduction is on GPU
-     * \param [in]    accumulateForce      Whether force should be accumulated or stored
+    /*! \brief Add the forces stored in nbat to the total force using GPU buffer ops
+     *
+     * \param [in]     locality             Local or non-local
+     * \param [in,out] totalForcesDevice    Device buffer the forces are added to
+     * \param [in]     forcesPmeDevice      Device buffer with PME forces
+     * \param [in]     pmeForcesReady       Event triggered when PME force calculation has completed
+     * \param [in]     useGpuFPmeReduction  Whether PME forces should be added
+     * \param [in]     accumulateForce      Whether the total force buffer already contains data
      */
-    void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality        locality,
-                                  gmx::ArrayRef<gmx::RVec>   force,
-                                  void                      *fPme,
-                                  GpuEventSynchronizer      *pmeForcesReady,
-                                  BufferOpsUseGpu            useGpu,
-                                  bool                       useGpuFPmeReduction,
-                                  bool                       accumulateForce);
+    void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality        locality,
+                                      DeviceBuffer<float>        totalForcesDevice,
+                                      void                      *forcesPmeDevice,
+                                      GpuEventSynchronizer      *pmeForcesReady,
+                                      bool                       useGpuFPmeReduction,
+                                      bool                       accumulateForce);
+
+    /*! \brief Getter for the GPU force buffer.
+     *
+     * \todo This function will be removed in future patches as the management of the device buffers
+     *       is moved to a separate object.
+     *
+     * \returns The force buffer in plain rvec format.
+     */
+    DeviceBuffer<float> getDeviceForces();

     /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
     void atomdata_init_add_nbat_f_to_f_gpu();

diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index 4dfcfa15d4..8721e40da6 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -324,17 +324,44 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell,
                                     gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
                                     int gmx_unused              natoms_total) CUDA_FUNC_TERM;

-/*! \brief F buffer operations on GPU: adds nb format force to rvec format. */
+/*! \brief Force buffer operations on GPU.
+ *
+ * Transforms the non-bonded forces into plain rvec format and adds all the force components to the
+ * total force buffer.
+ *
+ * \param[in]  atomLocality         Whether the reduction should be performed on local or non-local atoms.
+ * \param[out] totalForcesDevice    Device buffer to accumulate the resulting force.
+ * \param[in]  gpu_nbv              The NBNXM GPU data structure.
+ * \param[in]  pmeForcesDevice      Device buffer with PME forces.
+ * \param[in]  pmeForcesReady       Event that signals when the PME forces are ready for the reduction.
+ * \param[in]  atomStart            Index of the first atom to reduce forces for.
+ * \param[in]  numAtoms             Number of atoms to reduce forces for.
+ * \param[in]  useGpuFPmeReduction  Whether PME forces should be added.
+ * \param[in]  accumulateForce      Whether the total force buffer already contains useful data.
+ *
+ */
 CUDA_FUNC_QUALIFIER
 void nbnxn_gpu_add_nbat_f_to_f(AtomLocality gmx_unused           atomLocality,
+                               DeviceBuffer<float> gmx_unused    totalForcesDevice,
                                gmx_nbnxn_gpu_t gmx_unused       *gpu_nbv,
-                               void gmx_unused                  *fPmeDevicePtr,
+                               void gmx_unused                  *pmeForcesDevice,
                                GpuEventSynchronizer gmx_unused  *pmeForcesReady,
                                int gmx_unused                    atomStart,
-                               int gmx_unused                    nAtoms,
+                               int gmx_unused                    numAtoms,
                                bool gmx_unused                   useGpuFPmeReduction,
                                bool gmx_unused                   accumulateForce) CUDA_FUNC_TERM;

+/*! \brief Getter for the device forces buffer.
+ *
+ * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
+ *
+ * \param[in] gpu_nbv  The nonbonded data GPU structure.
+ *
+ * \returns Device forces buffer in plain rvec format.
+ */
+CUDA_FUNC_QUALIFIER
+DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
+
 /*! \brief Copy force buffer from CPU to GPU */
 CUDA_FUNC_QUALIFIER
 void nbnxn_launch_copy_f_to_gpu(AtomLocality gmx_unused atomLocality,
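
For reference, the caller-side pattern this change establishes in do_force(): the CPU and GPU
reductions are now dispatched explicitly, and the GPU variant takes its destination device
buffer as an argument, so any owner of a device force buffer can be substituted for
nbv->getDeviceForces(). A minimal sketch for the local atoms, assuming the surrounding
sim_util.cpp context (nbv, fr, f, force, and the flags are set up as in the hunks above):

    if (useGpuFBufOps == BufferOpsUseGpu::True)
    {
        /* GPU path: reduce on the device into an explicitly passed buffer. */
        nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
                                          nbv->getDeviceForces(),            // destination device buffer
                                          pme_gpu_get_device_f(fr->pmedata), // PME forces on the device
                                          pme_gpu_get_f_ready_synchronizer(fr->pmedata),
                                          useGpuFPmeReduction,               // fold PME forces in on the GPU
                                          accumulateForce);                  // keep existing buffer contents
        nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local);
        nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
    }
    else
    {
        /* CPU path: OpenMP tree/standard reduction into the rvec force buffer. */
        nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, force);
    }

Once a PropagatorStateData-style object owns the device force buffer, its getter would replace
nbv->getDeviceForces() in the call above, with no further changes needed on the NBNXM side.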