From 62b8797f78762ac66ac72e31dcb0f6b511537bb1 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Tue, 8 Oct 2019 03:47:46 +0200 Subject: [PATCH] Pass list of EventSynchronizers to GPU reduction Instead of passing events one-by-one or doing implicit synchronization hidden in a force tasks' internals, a list of synchronizer objects that signal the availability of forces from the source tasks is assembled and passed to the GPU buffer ops/reduction. The new dependency added here is the H2D transfer for CPU-side force contribution, PME-related event is kept, halo exchange dependency needs to be added. Change-Id: I5ca5ef919b49508790e813e0469aaeef4f6484c0 --- src/gromacs/mdlib/sim_util.cpp | 28 +++++++++++++++++++++++++-- src/gromacs/nbnxm/atomdata.cpp | 18 ++++++++--------- src/gromacs/nbnxm/atomdata.h | 18 ++++++++--------- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 29 ++++++++++++++++------------ src/gromacs/nbnxm/nbnxm.cpp | 14 +++++++------- src/gromacs/nbnxm/nbnxm.h | 14 +++++++------- src/gromacs/nbnxm/nbnxm_gpu.h | 20 +++++++++---------- 7 files changed, 85 insertions(+), 56 deletions(-) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 0ea11d9146..331c48a640 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1493,6 +1493,14 @@ void do_force(FILE *fplog, if (useGpuFBufOps == BufferOpsUseGpu::True) { + std::vector dependencyList; + dependencyList.reserve(2); + + if (useGpuPmeFReduction) + { + dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata)); + } + // TODO: move this into DomainLifetimeWorkload, including the second part of the condition // The bonded and free energy CPU tasks can have non-local force contributions // which are a dependency for the GPU force reduction. @@ -1501,11 +1509,16 @@ void do_force(FILE *fplog, if (haveNonLocalForceContribInCpuBuffer) { stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal); + dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal)); } + + // + // FIXME: are we adding a PME->nonlocal dep here? + // nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal, stateGpu->getForces(), pme_gpu_get_device_f(fr->pmedata), - pme_gpu_get_f_ready_synchronizer(fr->pmedata), + dependencyList, useGpuPmeFReduction, haveNonLocalForceContribInCpuBuffer); stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal); } @@ -1626,6 +1639,14 @@ void do_force(FILE *fplog, * on the non-alternating path. */ if (bUseOrEmulGPU && !alternateGpuWait) { + std::vector dependencyList; + dependencyList.reserve(2); + + if (useGpuPmeFReduction) + { + dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata)); + } + gmx::ArrayRef forceWithShift = forceOut.forceWithShiftForces().force(); if (useGpuFBufOps == BufferOpsUseGpu::True) @@ -1646,6 +1667,7 @@ void do_force(FILE *fplog, if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange) { stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local); + dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local)); } if (useGpuForcesHaloExchange) { @@ -1653,13 +1675,15 @@ void do_force(FILE *fplog, // for the local buffer ops on the result of GPU halo // exchange, which operates in the non-local stream and // writes to to local parf og the force buffer. + // // TODO improve this through use of an event - see Redmine #3093 + // push the event into the dependencyList nbv->stream_local_wait_for_nonlocal(); } nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local, stateGpu->getForces(), pme_gpu_get_device_f(fr->pmedata), - pme_gpu_get_f_ready_synchronizer(fr->pmedata), + dependencyList, useGpuPmeFReduction, haveLocalForceContribInCpuBuffer); // This function call synchronizes the local stream nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local); diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index a4e7b0285d..4a22cebe83 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1492,14 +1492,14 @@ void reduceForces(nbnxn_atomdata_t *nbat, } /* Add the force array(s) from nbnxn_atomdata_t to f */ -void reduceForcesGpu(const Nbnxm::AtomLocality locality, - DeviceBuffer totalForcesDevice, - const Nbnxm::GridSet &gridSet, - void *pmeForcesDevice, - GpuEventSynchronizer *pmeForcesReady, - gmx_nbnxn_gpu_t *gpu_nbv, - bool useGpuFPmeReduction, - bool accumulateForce) +void reduceForcesGpu(const Nbnxm::AtomLocality locality, + DeviceBuffer totalForcesDevice, + const Nbnxm::GridSet &gridSet, + void *pmeForcesDevice, + gmx::ArrayRef dependencyList, + gmx_nbnxn_gpu_t *gpu_nbv, + bool useGpuFPmeReduction, + bool accumulateForce) { int atomsStart = 0; int numAtoms = 0; @@ -1516,7 +1516,7 @@ void reduceForcesGpu(const Nbnxm::AtomLocality locality, totalForcesDevice, gpu_nbv, pmeForcesDevice, - pmeForcesReady, + dependencyList, atomsStart, numAtoms, useGpuFPmeReduction, accumulateForce); diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index 412d328bce..35d711beff 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -359,19 +359,19 @@ void reduceForces(nbnxn_atomdata_t *nbat, * \param[out] totalForcesDevice Device buffer to accumulate resulting force. * \param[in] gridSet The grids data. * \param[in] pmeForcesDevice Device buffer with PME forces. - * \param[in] pmeForcesReady Event that signals when the PME forces are ready for the reduction. + * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on. * \param[in] gpu_nbv The NBNXM GPU data structure. * \param[in] useGpuFPmeReduction Whether PME forces should be added. * \param[in] accumulateForce Whether there are usefull data already in the total force buffer. */ -void reduceForcesGpu(Nbnxm::AtomLocality locality, - DeviceBuffer totalForcesDevice, - const Nbnxm::GridSet &gridSet, - void *pmeForcesDevice, - GpuEventSynchronizer *pmeForcesReady, - gmx_nbnxn_gpu_t *gpu_nbv, - bool useGpuFPmeReduction, - bool accumulateForce); +void reduceForcesGpu(Nbnxm::AtomLocality locality, + DeviceBuffer totalForcesDevice, + const Nbnxm::GridSet &gridSet, + void *pmeForcesDevice, + gmx::ArrayRef dependencyList, + gmx_nbnxn_gpu_t *gpu_nbv, + bool useGpuFPmeReduction, + bool accumulateForce); /* Add the fshift force stored in nbat to fshift */ void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat, diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index ca88b5e94f..23502b3fbe 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -806,15 +806,15 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, } /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */ -void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, - DeviceBuffer totalForcesDevice, - gmx_nbnxn_gpu_t *nb, - void *pmeForcesDevice, - GpuEventSynchronizer *pmeForcesReady, - int atomStart, - int numAtoms, - bool useGpuFPmeReduction, - bool accumulateForce) +void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, + DeviceBuffer totalForcesDevice, + gmx_nbnxn_gpu_t *nb, + void *pmeForcesDevice, + gmx::ArrayRef dependencyList, + int atomStart, + int numAtoms, + bool useGpuFPmeReduction, + bool accumulateForce) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms"); @@ -824,10 +824,15 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, cudaStream_t stream = nb->stream[iLocality]; cu_atomdata_t *adat = nb->atdat; - if (useGpuFPmeReduction) + size_t gmx_used_in_debug numDependency = + static_cast((useGpuFPmeReduction == true)) + + static_cast((accumulateForce == true)); + GMX_ASSERT(numDependency >= dependencyList.size(), "Mismatching number of dependencies and call signature"); + + // Enqueue wait on all dependencies passed + for (auto const synchronizer : dependencyList) { - //Stream must wait for PME force completion - pmeForcesReady->enqueueWaitEvent(stream); + synchronizer->enqueueWaitEvent(stream); } /* launch kernel */ diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 88e6169b71..0ec882f3b7 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -189,12 +189,12 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality } void -nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality locality, - DeviceBuffer totalForcesDevice, - void *forcesPmeDevice, - GpuEventSynchronizer *pmeForcesReady, - bool useGpuFPmeReduction, - bool accumulateForce) +nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality locality, + DeviceBuffer totalForcesDevice, + void *forcesPmeDevice, + gmx::ArrayRef dependencyList, + bool useGpuFPmeReduction, + bool accumulateForce) { GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)), @@ -210,7 +210,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); - reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce); + reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce); wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 3be931f7d9..2f25f701cb 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -346,16 +346,16 @@ struct nonbonded_verlet_t * \param [in] locality Local or non-local * \param [in,out] totalForcesDevice Force to be added to * \param [in] forcesPmeDevice Device buffer with PME forces - * \param [in] pmeForcesReady Event triggered when PME force calculation has completed + * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on. * \param [in] useGpuFPmeReduction Whether PME forces should be added * \param [in] accumulateForce If the total force buffer already contains data */ - void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality locality, - DeviceBuffer totalForcesDevice, - void *forcesPmeDevice, - GpuEventSynchronizer *pmeForcesReady, - bool useGpuFPmeReduction, - bool accumulateForce); + void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality locality, + DeviceBuffer totalForcesDevice, + void *forcesPmeDevice, + gmx::ArrayRef dependencyList, + bool useGpuFPmeReduction, + bool accumulateForce); /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */ void atomdata_init_add_nbat_f_to_f_gpu(); diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index ae396e9089..ccb2aa5bde 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -301,7 +301,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell, * \param[in] totalForcesDevice Device buffer to accumulate resulting force. * \param[in] gpu_nbv The NBNXM GPU data structure. * \param[in] pmeForcesDevice Device buffer with PME forces. - * \param[in] pmeForcesReady Event that signals when the PME forces are ready for the reduction. + * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on. * \param[in] atomStart Index of the first atom to reduce forces for. * \param[in] numAtoms Number of atoms to reduce forces for. * \param[in] useGpuFPmeReduction Whether PME forces should be added. @@ -309,15 +309,15 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell, * */ CUDA_FUNC_QUALIFIER -void nbnxn_gpu_add_nbat_f_to_f(AtomLocality gmx_unused atomLocality, - DeviceBuffer gmx_unused totalForcesDevice, - gmx_nbnxn_gpu_t gmx_unused *gpu_nbv, - void gmx_unused *pmeForcesDevice, - GpuEventSynchronizer gmx_unused *pmeForcesReady, - int gmx_unused atomStart, - int gmx_unused numAtoms, - bool gmx_unused useGpuFPmeReduction, - bool gmx_unused accumulateForce) CUDA_FUNC_TERM; +void nbnxn_gpu_add_nbat_f_to_f(AtomLocality gmx_unused atomLocality, + DeviceBuffer gmx_unused totalForcesDevice, + gmx_nbnxn_gpu_t gmx_unused *gpu_nbv, + void gmx_unused *pmeForcesDevice, + gmx::ArrayRef gmx_unused dependencyList, + int gmx_unused atomStart, + int gmx_unused numAtoms, + bool gmx_unused useGpuFPmeReduction, + bool gmx_unused accumulateForce) CUDA_FUNC_TERM; /*! \brief Wait for GPU stream to complete */ CUDA_FUNC_QUALIFIER -- 2.22.0