From 0a4ca2c479237e915d96bcff8500c282a6ec3050 Mon Sep 17 00:00:00 2001 From: Alan Gray Date: Fri, 10 May 2019 03:28:58 -0700 Subject: [PATCH] PME reduction for CUDA F buffer operations Enable with GMX_USE_GPU_BUFFER_OPS env variable. Provides functionality to perform reduction of PME forces in the F buffer ops kernel. Currently active when a single GPU performs both PME and PP (multi-GPU support will follow in a patch which performs PME/PP comms directly between GPUs). When active, the Device->Host copy of PME forces and the CPU-side reduction are disabled. Implements part of #3029, refs #2817 Change-Id: I3e66b6919c1e86bf0bed42b74136f8694626910b --- src/gromacs/ewald/pme.h | 27 ++++++- src/gromacs/ewald/pme_gpu.cpp | 41 +++++++++-- src/gromacs/ewald/pme_gpu_internal.cpp | 36 +++++++++- src/gromacs/ewald/pme_gpu_internal.h | 21 +++++- src/gromacs/ewald/pme_gpu_types_host_impl.h | 2 + src/gromacs/ewald/pme_only.cpp | 2 +- src/gromacs/ewald/tests/pmetestcommon.cpp | 2 +- src/gromacs/mdlib/sim_util.cpp | 71 +++++++++++-------- src/gromacs/nbnxm/atomdata.cpp | 29 +++++--- src/gromacs/nbnxm/atomdata.h | 19 +++-- .../nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh | 35 +++++---- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 49 +++++++++---- src/gromacs/nbnxm/nbnxm.cpp | 34 +++++++-- src/gromacs/nbnxm/nbnxm.h | 31 +++++--- src/gromacs/nbnxm/nbnxm_gpu.h | 6 +- 15 files changed, 300 insertions(+), 105 deletions(-) diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 29cc106554..a660390135 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -73,6 +73,7 @@ struct NumPmeDomains; enum class GpuTaskCompletion; class PmeGpuProgram; +class GpuEventSynchronizer; //! Convenience name. using PmeGpuProgramHandle = const PmeGpuProgram *; @@ -382,10 +383,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_launch_complex_transforms(gmx_pme_t *GPU_F * \param[in] forceTreatment Tells how data should be treated. The gathering kernel either stores * the output reciprocal forces into the host array, or copies its contents to the GPU first * and accumulates. The reduction is non-atomic. + * \param[in] useGpuFPmeReduction Whether PME forces are reduced on GPU */ GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme), gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle), - PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment)) GPU_FUNC_TERM; + PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment), + bool GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM; /*! \brief * Attempts to complete PME GPU tasks. @@ -426,13 +429,15 @@ GPU_FUNC_QUALIFIER bool * \param[in] wcycle The wallclock counter. * \param[out] forceWithVirial The output force and virial * \param[out] enerd The output energies + * \param[in] useGpuFPmeReduction Whether PME forces are reduced on GPU */ GPU_FUNC_QUALIFIER void pme_gpu_wait_and_reduce(gmx_pme_t *GPU_FUNC_ARGUMENT(pme), int GPU_FUNC_ARGUMENT(flags), gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle), gmx::ForceWithVirial *GPU_FUNC_ARGUMENT(forceWithVirial), - gmx_enerdata_t *GPU_FUNC_ARGUMENT(enerd)) GPU_FUNC_TERM; + gmx_enerdata_t *GPU_FUNC_ARGUMENT(enerd), + bool GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM; /*! \brief * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. @@ -451,6 +456,22 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARG gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM; -/*! \brief Get pointer to device copy of coordinate data. */ +/*!
\brief Get pointer to device copy of coordinate data. + * \param[in] pme The PME data structure. + * \returns Pointer to coordinate data + */ GPU_FUNC_QUALIFIER void *pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + +/*! \brief Get pointer to device copy of force data. + * \param[in] pme The PME data structure. + * \returns Pointer to force data + */ +GPU_FUNC_QUALIFIER void *pme_gpu_get_device_f(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + +/*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion + * \param[in] pme The PME data structure. + * \returns Pointer to synchronizer + */ +GPU_FUNC_QUALIFIER GpuEventSynchronizer *pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + #endif diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 07d15cdf1a..cd1ba956c4 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -249,7 +249,8 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t *pme, void pme_gpu_launch_gather(const gmx_pme_t *pme, gmx_wallcycle gmx_unused *wcycle, - PmeForceOutputHandling forceTreatment) + PmeForceOutputHandling forceTreatment, + bool useGpuFPmeReduction) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); @@ -262,7 +263,7 @@ void pme_gpu_launch_gather(const gmx_pme_t *pme, wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); const unsigned int gridIndex = 0; real *fftgrid = pme->fftgrid[gridIndex]; - pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid)); + pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid), useGpuFPmeReduction); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); } @@ -286,7 +287,8 @@ static void pme_gpu_reduce_outputs(const int flags, const PmeOutput &output, gmx_wallcycle *wcycle, gmx::ForceWithVirial *forceWithVirial, - gmx_enerdata_t *enerd) + gmx_enerdata_t *enerd, + bool useGpuFPmeReduction) { wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION); GMX_ASSERT(forceWithVirial, "Invalid force pointer"); @@ -297,7 +299,10 @@ static void pme_gpu_reduce_outputs(const int flags, forceWithVirial->addVirialContribution(output.coulombVirial_); enerd->term[F_COUL_RECIP] += output.coulombEnergy_; } - sum_forces(forceWithVirial->force_, output.forces_); + if (!useGpuFPmeReduction) + { + sum_forces(forceWithVirial->force_, output.forces_); + } wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION); } @@ -344,7 +349,7 @@ bool pme_gpu_try_finish_task(gmx_pme_t *pme, PmeOutput output = pme_gpu_getOutput(*pme, flags); wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); - pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd); + pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd, false); return true; } @@ -357,7 +362,9 @@ PmeOutput pme_gpu_wait_finish_task(gmx_pme_t *pme, GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER); + // Synchronize the whole PME stream at once, including D2H result transfers.
+ // TODO: make this sync conditional with useGpuFPmeReduction to wait only for virial/energies pme_gpu_synchronize(pme->gpu); pme_gpu_update_timings(pme->gpu); @@ -371,10 +378,11 @@ void pme_gpu_wait_and_reduce(gmx_pme_t *pme, const int flags, gmx_wallcycle *wcycle, gmx::ForceWithVirial *forceWithVirial, - gmx_enerdata_t *enerd) + gmx_enerdata_t *enerd, + bool useGpuFPmeReduction) { PmeOutput output = pme_gpu_wait_finish_task(pme, flags, wcycle); - pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd); + pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd, useGpuFPmeReduction); } void pme_gpu_reinit_computation(const gmx_pme_t *pme, @@ -400,3 +408,22 @@ void *pme_gpu_get_device_x(const gmx_pme_t *pme) } return pme_gpu_get_kernelparam_coordinates(pme->gpu); } + +void *pme_gpu_get_device_f(const gmx_pme_t *pme) +{ + if (!pme || !pme_gpu_active(pme)) + { + return nullptr; + } + return pme_gpu_get_kernelparam_forces(pme->gpu); +} + +GpuEventSynchronizer * pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *pme) +{ + if (!pme || !pme_gpu_active(pme)) + { + return nullptr; + } + + return pme_gpu_get_forces_ready_synchronizer(pme->gpu); +} diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index cad028b557..70c74c900c 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -239,7 +239,6 @@ void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordina // FIXME: sync required since the copied data will be used by PP stream when using single GPU for both // Remove after adding the required event-based sync between the above H2D and the transform kernel pme_gpu_synchronize(pmeGpu); - #endif } @@ -1207,7 +1206,8 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid, void pme_gpu_gather(PmeGpu *pmeGpu, PmeForceOutputHandling forceTreatment, - const float *h_grid + const float *h_grid, + bool useGpuFPmeReduction ) { /* Copying the input CPU forces for reduction */ @@ -1269,7 +1269,14 @@ void pme_gpu_gather(PmeGpu *pmeGpu, launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs); pme_gpu_stop_timing(pmeGpu, timingId); - pme_gpu_copy_output_forces(pmeGpu); + if (useGpuFPmeReduction) + { + pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream); + } + else + { + pme_gpu_copy_output_forces(pmeGpu); + } } void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu) @@ -1282,5 +1289,28 @@ void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu) { return nullptr; } +} +void * pme_gpu_get_kernelparam_forces(const PmeGpu *pmeGpu) +{ + if (pmeGpu && pmeGpu->kernelParams) + { + return pmeGpu->kernelParams->atoms.d_forces; + } + else + { + return nullptr; + } +} + +GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *pmeGpu) +{ + if (pmeGpu && pmeGpu->kernelParams) + { + return &pmeGpu->archSpecific->pmeForcesReady; + } + else + { + return nullptr; + } } diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index e22fe64592..19d243af41 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -437,14 +437,31 @@ GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu) * \param[in] forceTreatment Tells how data in h_forces should be treated. * TODO: determine efficiency/balance of host/device-side reductions. 
* \param[in] h_grid The host-side grid buffer (used only in testing mode) + * \param[in] useGpuFPmeReduction Whether forces are reduced on GPU */ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu), PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment), - const float *GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM; + const float *GPU_FUNC_ARGUMENT(h_grid), + bool GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM; -/*! \brief Return pointer to device copy of coordinate data. */ +/*! \brief Return pointer to device copy of coordinate data. + * \param[in] pmeGpu The PME GPU structure. + * \returns Pointer to coordinate data + */ GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); +/*! \brief Return pointer to device copy of force data. + * \param[in] pmeGpu The PME GPU structure. + * \returns Pointer to force data + */ +GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_forces(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + +/*! \brief Return pointer to the sync object triggered after the PME force calculation completion + * \param[in] pmeGpu The PME GPU structure. + * \returns Pointer to sync object + */ +GPU_FUNC_QUALIFIER GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr); + /* The inlined convenience PME GPU status getters */ /*! \libinternal \brief diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h index 6949bde60f..be865c8778 100644 --- a/src/gromacs/ewald/pme_gpu_types_host_impl.h +++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h @@ -79,6 +79,8 @@ struct PmeGpuSpecific Context context; /* Synchronization events */ + /*! \brief Triggered after the PME Force Calculations have been completed */ + GpuEventSynchronizer pmeForcesReady; /*! \brief Triggered after the grid has been copied to the host (after the spreading stage). 
*/ GpuEventSynchronizer syncSpreadGridD2H; diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 6f8ebe1723..e3e948a856 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -626,7 +626,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme, pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags); pme_gpu_launch_spread(pme, as_rvec_array(pme_pp->x.data()), wcycle); pme_gpu_launch_complex_transforms(pme, wcycle); - pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set); + pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set, false); output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle); pme_gpu_reinit_computation(pme, wcycle); } diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index 55515294d2..763d79d682 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -418,7 +418,7 @@ void pmePerformGather(gmx_pme_t *pme, CodePath mode, { std::copy(std::begin(forces), std::end(forces), std::begin(output.forces_)); } - pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid)); + pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid), false); std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces)); } break; diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 11963c0de6..0c53b40c4d 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -623,12 +623,14 @@ static inline void launchPmeGpuSpread(gmx_pme_t *pmedata, * * \param[in] pmedata The PME structure * \param[in] wcycle The wallcycle structure + * \param[in] useGpuFPmeReduction Whether forces will be reduced on GPU */ static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata, - gmx_wallcycle_t wcycle) + gmx_wallcycle_t wcycle, + bool useGpuFPmeReduction) { pme_gpu_launch_complex_transforms(pmedata, wcycle); - pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set); + pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set, useGpuFPmeReduction); } /*! \brief @@ -695,9 +697,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L); nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, - as_rvec_array(force->unpaddedArrayRef().data()), - BufferOpsUseGpu::False, - GpuBufferOpsAccumulateForce::Null); + as_rvec_array(force->unpaddedArrayRef().data())); } } } @@ -907,12 +907,18 @@ void do_force(FILE *fplog, ((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) | ((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) | ((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0); - const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) - && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ? - BufferOpsUseGpu::True : BufferOpsUseGpu::False; + // Switches on whether to use GPU for position and force buffer operations + // TODO consider all possible combinations of triggers, and how to combine optimally in each case. const BufferOpsUseGpu useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) ?
BufferOpsUseGpu::True : BufferOpsUseGpu::False;; + // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU + const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) + && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ? + BufferOpsUseGpu::True : BufferOpsUseGpu::False; + // TODO: move / add this flag to the internal PME GPU data structures + const bool useGpuFPmeReduction = (useGpuFBufOps == BufferOpsUseGpu::True) && + thisRankHasDuty(cr, DUTY_PME) && useGpuPme; // only supported if this rank is performing PME on the GPU /* At a search step we need to start the first balancing region * somewhere early inside the step after communication during domain @@ -1141,7 +1147,7 @@ // X copy/transform to allow overlap as well as after the GPU NB // launch to avoid FFT launch overhead hijacking the CPU and delaying // the nonbonded kernel. - launchPmeGpuFftAndGather(fr->pmedata, wcycle); + launchPmeGpuFftAndGather(fr->pmedata, wcycle, useGpuFPmeReduction); } /* Communicate coordinates and sum dipole if necessary + @@ -1327,9 +1333,7 @@ void do_force(FILE *fplog, * communication with calculation with domain decomposition. */ wallcycle_stop(wcycle, ewcFORCE); - nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f(), - BufferOpsUseGpu::False, - GpuBufferOpsAccumulateForce::Null); + nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f()); wallcycle_start_nocount(wcycle, ewcFORCE); @@ -1365,20 +1369,8 @@ void do_force(FILE *fplog, flags, &forceOut.forceWithVirial(), enerd, ed, bNS); - // flag to specify if CPU force output is preset in force - // buffer. For now, this is true even when useGpuPme == true - // (because on-GPU PME-nonbonded reduction will be added in - // follow-up) - // TODO adapt the below when on-GPU PME-nonbonded reduction is available. - bool useCpuPmeReduction = true; - bool haveCpuForces = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuPmeReduction); - // flag to specify if forces should be accumulated in force buffer - // ops. For now, this is solely determined by above haveCpuForces - // flag, but in future developments it will also depend on - // e.g. whether the GPU force halo exchange is active. - GpuBufferOpsAccumulateForce accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && - haveCpuForces ? GpuBufferOpsAccumulateForce::True : - GpuBufferOpsAccumulateForce::False; + bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuFPmeReduction; + bool haveCpuForces = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuFPmeReduction); // Will store the amount of cycles spent waiting for the GPU that // will be later used in the DLB accounting. @@ -1410,8 +1402,15 @@ void do_force(FILE *fplog, { nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal); } + + // flag to specify if forces should be accumulated in force buffer + // ops. For non-local part, this just depends on whether CPU forces are present.
+ bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && haveCpuForces; nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal, - forceOut.f(), useGpuFBufOps, accumulateForce); + forceOut.f(), pme_gpu_get_device_f(fr->pmedata), + pme_gpu_get_f_ready_synchronizer(fr->pmedata), + useGpuFBufOps, useGpuFPmeReduction, accumulateForce); + if (useGpuFBufOps == BufferOpsUseGpu::True) { nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal); @@ -1456,7 +1455,7 @@ void do_force(FILE *fplog, if (!alternateGpuWait && useGpuPme) { - pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd); + pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd, useGpuFPmeReduction); } /* Wait for local GPU NB outputs on the non-alternating wait path */ @@ -1515,12 +1514,22 @@ void do_force(FILE *fplog, // - wait for force reduction does not need to block host (at least not here, it's sufficient to wait // before the next CPU task that consumes the forces: vsite spread or update) // - if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces) + if (useGpuFBufOps == BufferOpsUseGpu::True && (haveCpuForces || DOMAINDECOMP(cr))) { nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::Local); } + // flag to specify if forces should be accumulated in force + // buffer ops. For local part, this depends on whether CPU + // forces are present, or if DD is active (in which case the + // halo exchange has resulted in contributions from the + // non-local part). + bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && + (haveCpuForces || DOMAINDECOMP(cr)); nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, - forceOut.f(), useGpuFBufOps, accumulateForce); + forceOut.f(), pme_gpu_get_device_f(fr->pmedata), + pme_gpu_get_f_ready_synchronizer(fr->pmedata), + useGpuFBufOps, useGpuFPmeReduction, accumulateForce); + if (useGpuFBufOps == BufferOpsUseGpu::True) { nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::Local); diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp index 8288a1d227..dd0ccb2ce7 100644 --- a/src/gromacs/nbnxm/atomdata.cpp +++ b/src/gromacs/nbnxm/atomdata.cpp @@ -1441,12 +1441,15 @@ static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t *nbat, /* Add the force array(s) from nbnxn_atomdata_t to f */ template <bool useGpu> -void reduceForces(nbnxn_atomdata_t *nbat, - const Nbnxm::AtomLocality locality, - const Nbnxm::GridSet &gridSet, - rvec *f, - gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce) +void reduceForces(nbnxn_atomdata_t *nbat, + const Nbnxm::AtomLocality locality, + const Nbnxm::GridSet &gridSet, + rvec *f, + void *pmeFDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, + gmx_nbnxn_gpu_t *gpu_nbv, + bool useGpuFPmeReduction, + bool accumulateForce) { int a0 = 0; int na = 0; @@ -1463,9 +1466,11 @@ void reduceForces(nbnxn_atomdata_t *nbat, { Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality, gpu_nbv, + pmeFDeviceBuffer, + pmeForcesReady, a0, na, + useGpuFPmeReduction, accumulateForce); - } else { @@ -1510,16 +1515,22 @@ void reduceForces<true>(nbnxn_atomdata_t *nbat, const Nbnxm::AtomLocality locality, const Nbnxm::GridSet &gridSet, rvec *f, + void *fpme, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); template void reduceForces<false>(nbnxn_atomdata_t *nbat, const Nbnxm::AtomLocality locality, const Nbnxm::GridSet
&gridSet, rvec *f, + void *fpme, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, rvec *fshift) diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h index d627732014..e34c8bd558 100644 --- a/src/gromacs/nbnxm/atomdata.h +++ b/src/gromacs/nbnxm/atomdata.h @@ -59,14 +59,14 @@ struct tMPI_Atomic; enum class BufferOpsUseGpu; +class GpuEventSynchronizer; + namespace Nbnxm { class GridSet; enum class KernelType; } -enum class GpuBufferOpsAccumulateForce; - /* Convenience type for vector with aligned memory */ template <class T> using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>; @@ -344,8 +344,11 @@ void reduceForces(nbnxn_atomdata_t *nbat, Nbnxm::AtomLocality locality, const Nbnxm::GridSet &gridSet, rvec *f, + void *pmeFDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); extern template @@ -353,16 +356,22 @@ void reduceForces<true>(nbnxn_atomdata_t *nbat, const Nbnxm::AtomLocality locality, const Nbnxm::GridSet &gridSet, rvec *f, + void *pmeFDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); extern template void reduceForces<false>(nbnxn_atomdata_t *nbat, const Nbnxm::AtomLocality locality, const Nbnxm::GridSet &gridSet, rvec *f, + void *pmeFDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, gmx_nbnxn_gpu_t *gpu_nbv, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); /* Add the fshift force stored in nbat to fshift */ void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat, rvec *fshift) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh index 09d6e37243..bc0d2c2c5e 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_buffer_ops_kernels.cuh @@ -139,24 +139,26 @@ __global__ void nbnxn_gpu_x_to_nbat_x_kernel(int numColu /*!
\brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f * - * \param[in] fnb Force in nbat format - * \param[in,out] f Force buffer to be reduced into - * \param[in] cell Cell index mapping - * \param[in] a0 start atom index - * \param[in] a1 end atom index - * \param[in] stride stride between atoms in memory + * \param[in] fnb Force in nbat format + * \param[in] fPmeDeviceBuffer PME force + * \param[in,out] f Force buffer to be reduced into + * \param[in] cell Cell index mapping + * \param[in] atomStart Start atom index + * \param[in] nAtoms Number of atoms */ -template <bool accumulateForce> +template <bool accumulateForce, bool addPmeF> __global__ void nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb, - rvec * f, + const float3 *__restrict__ fPmeDeviceBuffer, + float3 * f, const int *__restrict__ cell, const int atomStart, const int nAtoms); -template <bool accumulateForce> +template <bool accumulateForce, bool addPmeF> __global__ void nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb, - rvec * f, + const float3 *__restrict__ fPmeDeviceBuffer, + float3 * f, const int *__restrict__ cell, const int atomStart, const int nAtoms) @@ -170,16 +172,23 @@ nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb, { int i = cell[atomStart+threadIndex]; - float3 *f_dest = (float3 *)&f[atomStart+threadIndex][XX]; + float3 *fDest = (float3 *)&f[atomStart+threadIndex]; + float3 temp; if (accumulateForce) { - *f_dest += fnb[i]; + temp = *fDest; + temp += fnb[i]; } else { - *f_dest = fnb[i]; + temp = fnb[i]; } + if (addPmeF) + { + temp += fPmeDeviceBuffer[atomStart+threadIndex]; + } + *fDest = temp; } return; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index ec16b1b01c..74925428ac 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -54,6 +54,7 @@ #include "nbnxm_cuda.h" #include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/mdlib/force_flags.h" #include "gromacs/nbnxm/atomdata.h" @@ -851,18 +852,27 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid &grid, } /* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */ -void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, - gmx_nbnxn_gpu_t *nb, - int atomStart, - int nAtoms, - GpuBufferOpsAccumulateForce accumulateForce) +void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, - gmx_nbnxn_gpu_t *nb, + gmx_nbnxn_gpu_t *nb, + void *fPmeDevicePtr, + GpuEventSynchronizer *pmeForcesReady, + int atomStart, + int nAtoms, + bool useGpuFPmeReduction, + bool accumulateForce) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality); cudaStream_t stream = nb->stream[iLocality]; + cu_atomdata_t *adat = nb->atdat; + bool addPmeF = useGpuFPmeReduction; - cu_atomdata_t *adat = nb->atdat; + if (addPmeF) + { + // Stream must wait for PME force completion + pmeForcesReady->enqueueWaitEvent(stream); + } /* launch kernel */ @@ -876,16 +886,27 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality, config.sharedMemorySize = 0; config.stream = stream; - auto kernelFn = (accumulateForce == GpuBufferOpsAccumulateForce::True) ? - nbnxn_gpu_add_nbat_f_to_f_kernel<true> : nbnxn_gpu_add_nbat_f_to_f_kernel<false>; - const float3 *fPtr = adat->f; - rvec *frvec = nb->frvec; - const int *cell = nb->cell; + auto kernelFn = accumulateForce ?
+ nbnxn_gpu_add_nbat_f_to_f_kernel<true, false> : + nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>; + + if (addPmeF) + { + kernelFn = accumulateForce ? + nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> : + nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>; + } + + const float3 *d_f = adat->f; + float3 *d_fNB = (float3*) nb->frvec; + const float3 *d_fPme = (float3*) fPmeDevicePtr; + const int *d_cell = nb->cell; const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, - &fPtr, - &frvec, - &cell, + &d_f, + &d_fPme, + &d_fNB, + &d_cell, &atomStart, &nAtoms); diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 77119044cb..dfe8c4e936 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -158,17 +158,43 @@ gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const return pairSearch_->gridSet().cells(); } +void +nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality, + rvec *f) +{ + + /* Skip the reduction if there was no short-range GPU work to do + * (either NB or both NB and bonded work). */ + if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality)) + { + return; + } + + wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); + + reduceForces<false>(nbat.get(), locality, pairSearch_->gridSet(), f, nullptr, nullptr, gpu_nbv, false, false); + + wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); + wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); +} + void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality, rvec *f, + void *fPmeDeviceBuffer, + GpuEventSynchronizer *pmeForcesReady, BufferOpsUseGpu useGpu, - GpuBufferOpsAccumulateForce accumulateForce) + bool useGpuFPmeReduction, + bool accumulateForce) { - GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && - (accumulateForce == GpuBufferOpsAccumulateForce::True)), + GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && accumulateForce), "Accumulatation of force is only valid when GPU buffer ops are active"); + GMX_ASSERT((useGpuFPmeReduction == (fPmeDeviceBuffer != nullptr)), + "GPU PME force reduction is only valid when a non-null GPU PME force pointer is available"); + /* Skip the reduction if there was no short-range GPU work to do * (either NB or both NB and bonded work). */ if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality)) @@ -180,7 +206,7 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>; - fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce); + fn(nbat.get(), locality, pairSearch_->gridSet(), f, fPmeDeviceBuffer, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce); wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index a749b074b9..cf0b2f62ab 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -135,14 +135,7 @@ enum class BufferOpsUseGpu { False }; -/*! \brief Switch for whether forces should accumulate in GPU buffer ops */ -enum class GpuBufferOpsAccumulateForce -{ - True, // Force should be accumulated and format converted - False, // Force should be not accumulated, just format converted - Null // GPU buffer ops are not in use, so this object is not applicable -}; - +class GpuEventSynchronizer; namespace gmx { @@ -310,11 +303,29 @@ struct nonbonded_verlet_t int forceFlags, t_nrnb *nrnb); - //!
Add the forces stored in nbat to f, zeros the forces in nbat */ + /*! \brief Add the forces stored in nbat to f, zeros the forces in nbat + * \param [in] locality Local or non-local + * \param [in,out] f Force to be added to + */ + void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality, + rvec *f); + + /*! \brief Add the forces stored in nbat to f, allowing for the possibility that GPU buffer ops are active + * \param [in] locality Local or non-local + * \param [in,out] f Force to be added to + * \param [in] fPme Force from PME calculation + * \param [in] pmeForcesReady Event triggered when PME force calculation has completed + * \param [in] useGpu Whether GPU buffer ops are active + * \param [in] useGpuFPmeReduction Whether PME force reduction is on GPU + * \param [in] accumulateForce Whether force should be accumulated or stored + */ void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality, rvec *f, + void *fPme, + GpuEventSynchronizer *pmeForcesReady, BufferOpsUseGpu useGpu, - GpuBufferOpsAccumulateForce accumulateForce); + bool useGpuFPmeReduction, + bool accumulateForce); /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */ void atomdata_init_add_nbat_f_to_f_gpu(); diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index b213ff4c07..0f0e8de218 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -54,7 +54,6 @@ struct nbnxn_atomdata_t; enum class GpuTaskCompletion; -enum class GpuBufferOpsAccumulateForce; namespace gmx { @@ -280,9 +279,12 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell, CUDA_FUNC_QUALIFIER void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality gmx_unused atomLocality, gmx_nbnxn_gpu_t gmx_unused *gpu_nbv, + void gmx_unused *fPmeDevicePtr, + GpuEventSynchronizer gmx_unused *pmeForcesReady, int gmx_unused atomStart, int gmx_unused nAtoms, - GpuBufferOpsAccumulateForce gmx_unused accumulateForce) CUDA_FUNC_TERM; + bool gmx_unused useGpuFPmeReduction, + bool gmx_unused accumulateForce) CUDA_FUNC_TERM; /*! \brief Copy force buffer from CPU to GPU */ CUDA_FUNC_QUALIFIER -- 2.22.0
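Illustrative sketch (not part of the patch): the central dependency in this change is that the buffer-ops stream may only read the PME force buffer after the PME gather kernel has completed, which the patch expresses through the pmeForcesReady synchronizer (markEvent in pme_gpu_gather, enqueueWaitEvent in nbnxn_gpu_add_nbat_f_to_f). GROMACS's GpuEventSynchronizer provides exactly those two calls; the simplified stand-in class below, which is an assumption-level sketch and not the GROMACS implementation, shows the same producer/consumer ordering with raw CUDA events.

#include <cuda_runtime.h>

class StreamEventSync // simplified stand-in for GpuEventSynchronizer
{
    public:
        StreamEventSync()  { cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); }
        ~StreamEventSync() { cudaEventDestroy(event_); }
        // Producer side: record completion of all work enqueued so far in this stream.
        // The PME stream would call this right after the gather kernel, instead of
        // copying the forces back to the host.
        void markEvent(cudaStream_t producerStream)
        {
            cudaEventRecord(event_, producerStream);
        }
        // Consumer side: make another stream wait on the device (without blocking the
        // host) until the marked work has finished. The buffer-ops stream would call
        // this before launching the reduction kernel that reads the PME forces.
        void enqueueWaitEvent(cudaStream_t consumerStream)
        {
            cudaStreamWaitEvent(consumerStream, event_, 0);
        }
    private:
        cudaEvent_t event_;
};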
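Illustrative sketch (not part of the patch): the F buffer ops reduction itself, reduced to a standalone CUDA example. All names here (reduceKernel, launchReduce) are simplified stand-ins rather than the GROMACS API, and the cell map, grid layout and kernel-launch plumbing of the real nbnxn_gpu_add_nbat_f_to_f are omitted; the point is the template dispatch on <accumulateForce, addPmeForce> and the stream-ordered wait before the kernel that consumes the PME forces.

#include <cuda_runtime.h>

// Compile-time switches mirror the kernel's template parameters: dead branches
// are removed by the compiler, so no per-atom runtime flag checks remain.
template<bool accumulateForce, bool addPmeForce>
__global__ void reduceKernel(const float3* fNbatFormat, // forces in nbat (cell) order
                             const float3* fPme,        // PME forces, atom order
                             float3*       f,           // output force buffer, atom order
                             const int*    cell,        // atom -> nbat index map
                             int           atomStart,
                             int           numAtoms)
{
    int threadIndex = blockIdx.x*blockDim.x + threadIdx.x;
    if (threadIndex < numAtoms)
    {
        int    i    = cell[atomStart + threadIndex];
        float3 temp = accumulateForce ? f[atomStart + threadIndex] : make_float3(0, 0, 0);
        temp.x += fNbatFormat[i].x;
        temp.y += fNbatFormat[i].y;
        temp.z += fNbatFormat[i].z;
        if (addPmeForce)
        {
            temp.x += fPme[atomStart + threadIndex].x;
            temp.y += fPme[atomStart + threadIndex].y;
            temp.z += fPme[atomStart + threadIndex].z;
        }
        f[atomStart + threadIndex] = temp;
    }
}

// Host-side dispatch, mirroring the kernelFn selection in nbnxm_cuda.cu: when the
// PME forces are reduced on the GPU, the reduction stream first waits on the event
// recorded by the PME stream, then the matching template instantiation is launched.
static void launchReduce(const float3* fNbatFormat, const float3* fPme, float3* f,
                         const int* cell, int atomStart, int numAtoms,
                         bool accumulate, bool addPme,
                         cudaStream_t reductionStream, cudaEvent_t pmeForcesReady)
{
    if (addPme)
    {
        // Equivalent of pmeForcesReady->enqueueWaitEvent(stream) in the patch
        cudaStreamWaitEvent(reductionStream, pmeForcesReady, 0);
    }
    auto kernel = addPme ? (accumulate ? reduceKernel<true, true> : reduceKernel<false, true>)
                         : (accumulate ? reduceKernel<true, false> : reduceKernel<false, false>);
    const int blockSize = 128;
    const int numBlocks = (numAtoms + blockSize - 1)/blockSize;
    kernel<<<numBlocks, blockSize, 0, reductionStream>>>(fNbatFormat, fPme, f, cell,
                                                         atomStart, numAtoms);
}

The same pattern scales to the multi-GPU case promised in the commit message: as long as the PME force buffer is resident on the local device and an event marks its readiness, only the wait/launch ordering above is needed, not a host-side synchronization.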