From c4c2b46d9e3088511b9534dbfa65f6ef403885c8 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Fri, 30 Aug 2019 15:16:24 +0200 Subject: [PATCH] Pass the new gmx::ForceFlags to the nbnxm module - Changed in NB kernel dispatch - NB GPU transfer launch and wait Change-Id: Idd2738797ddcdd372e90cdfcc066b056a29d8de2 --- src/gromacs/mdlib/sim_util.cpp | 37 ++++++++-------- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 21 ++++----- src/gromacs/nbnxm/gpu_common.h | 17 +++----- src/gromacs/nbnxm/kerneldispatch.cpp | 9 ++-- src/gromacs/nbnxm/nbnxm.h | 3 +- src/gromacs/nbnxm/nbnxm_gpu.h | 59 +++++++++++++------------- src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 29 ++++++------- 7 files changed, 83 insertions(+), 92 deletions(-) diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 171675e914..61ca2b0758 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -314,14 +314,15 @@ static void post_process_forces(const t_commrec *cr, static void do_nb_verlet(t_forcerec *fr, const interaction_const_t *ic, gmx_enerdata_t *enerd, - const int flags, + int legacyForceFlags, + const gmx::ForceFlags &forceFlags, const Nbnxm::InteractionLocality ilocality, const int clearF, const int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle) { - if (!(flags & GMX_FORCE_NONBONDED)) + if (!(legacyForceFlags & GMX_FORCE_NONBONDED)) { /* skip non-bonded calculation */ return; @@ -351,7 +352,7 @@ static void do_nb_verlet(t_forcerec *fr, } } - nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb); + nbv->dispatchNonbondedKernel(ilocality, *ic, legacyForceFlags, forceFlags, clearF, *fr, enerd, nrnb); } static inline void clear_rvecs_omp(int n, rvec v[]) @@ -644,7 +645,7 @@ static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata, * \param[in,out] pmedata PME module data * \param[in,out] forceOutputs Output buffer for the forces and virial * \param[in,out] enerd Energy data structure results are reduced into - * \param[in] flags Force flags + * \param[in] forceFlags Force schedule flags * \param[in] pmeFlags PME flags * \param[in] wcycle The wallcycle structure */ @@ -652,7 +653,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv gmx_pme_t *pmedata, gmx::ForceOutputs *forceOutputs, gmx_enerdata_t *enerd, - int flags, + const gmx::ForceFlags &forceFlags, int pmeFlags, gmx_wallcycle_t wcycle) { @@ -678,7 +679,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv { GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check; isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv, - flags, // FIXME remove this + forceFlags, Nbnxm::AtomLocality::Local, enerd->grpp.ener[egLJSR].data(), enerd->grpp.ener[egCOULSR].data(), @@ -1149,7 +1150,7 @@ void do_force(FILE *fplog, /* launch local nonbonded work on GPU */ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); - do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFNo, + do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); wallcycle_stop(wcycle, ewcLAUNCH_GPU); @@ -1211,7 +1212,7 @@ void do_force(FILE *fplog, /* launch non-local nonbonded tasks on GPU */ wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED); - do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo, + do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); @@ -1230,12 +1231,10 @@ void do_force(FILE *fplog, if (havePPDomainDecomposition(cr)) { Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), - // FIXME - flags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce); + forceFlags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce); } Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), - // FIXME - flags, Nbnxm::AtomLocality::Local, copyBackNbForce); + forceFlags, Nbnxm::AtomLocality::Local, copyBackNbForce); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); if (forceWork.haveGpuBondedWork && forceFlags.computeEnergy) @@ -1317,7 +1316,7 @@ void do_force(FILE *fplog, if (!bUseOrEmulGPU) { - do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, enbvClearFYes, + do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFYes, step, nrnb, wcycle); } @@ -1344,7 +1343,7 @@ void do_force(FILE *fplog, { if (havePPDomainDecomposition(cr)) { - do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo, + do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle); } @@ -1405,7 +1404,7 @@ void do_force(FILE *fplog, if (bUseGPU) { cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv, - flags, Nbnxm::AtomLocality::NonLocal, + forceFlags, Nbnxm::AtomLocality::NonLocal, enerd->grpp.ener[egLJSR].data(), enerd->grpp.ener[egCOULSR].data(), forceWithShiftForces.shiftForces(), @@ -1414,7 +1413,7 @@ void do_force(FILE *fplog, else { wallcycle_start_nocount(wcycle, ewcFORCE); - do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes, + do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle); wallcycle_stop(wcycle, ewcFORCE); } @@ -1470,7 +1469,7 @@ void do_force(FILE *fplog, if (alternateGpuWait) { alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, - flags, pmeFlags, wcycle); + forceFlags, pmeFlags, wcycle); } if (!alternateGpuWait && useGpuPme) @@ -1489,7 +1488,7 @@ void do_force(FILE *fplog, const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */ const float waitCycles = Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv, - flags, Nbnxm::AtomLocality::Local, + forceFlags, Nbnxm::AtomLocality::Local, enerd->grpp.ener[egLJSR].data(), enerd->grpp.ener[egCOULSR].data(), forceOut.forceWithShiftForces().shiftForces(), @@ -1517,7 +1516,7 @@ void do_force(FILE *fplog, // NOTE: emulation kernel is not included in the balancing region, // but emulation mode does not target performance anyway wallcycle_start_nocount(wcycle, ewcFORCE); - do_nb_verlet(fr, ic, enerd, flags, Nbnxm::InteractionLocality::Local, + do_nb_verlet(fr, ic, enerd, flags, forceFlags, Nbnxm::InteractionLocality::Local, DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes, step, nrnb, wcycle); wallcycle_stop(wcycle, ewcFORCE); diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 74925428ac..0b30c010b0 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -56,7 +56,7 @@ #include "gromacs/gpu_utils/cudautils.cuh" #include "gromacs/gpu_utils/gpueventsynchronizer.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" -#include "gromacs/mdlib/force_flags.h" +#include "gromacs/mdlib/ppforceworkload.h" #include "gromacs/nbnxm/atomdata.h" #include "gromacs/nbnxm/gpu_common.h" #include "gromacs/nbnxm/gpu_common_utils.h" @@ -403,7 +403,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_cuda_t *nb, with this event in the non-local stream before launching the non-bonded kernel. */ void gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, - const int flags, + const gmx::ForceFlags &forceFlags, const InteractionLocality iloc) { cu_atomdata_t *adat = nb->atdat; @@ -412,8 +412,6 @@ void gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, cu_timers_t *t = nb->timers; cudaStream_t stream = nb->stream[iloc]; - bool bCalcEner = flags & GMX_FORCE_ENERGY; - bool bCalcFshift = flags & GMX_FORCE_VIRIAL; bool bDoTime = nb->bDoTime; /* Don't launch the non-local kernel if there is no work to do. @@ -488,10 +486,10 @@ void gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, auto *timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr; const auto kernel = select_nbnxn_kernel(nbp->eeltype, nbp->vdwtype, - bCalcEner, + forceFlags.computeEnergy, (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune), nb->dev_info); - const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &bCalcFshift); + const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &forceFlags.computeVirial); launchGpuKernel(kernel, config, timingEvent, "k_calc_nb", kernelArgs); if (bDoTime) @@ -645,7 +643,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, nbnxn_atomdata_t *nbatom, - const int flags, + const gmx::ForceFlags &forceFlags, const AtomLocality atomLocality, const bool copyBackNbForce) { @@ -663,9 +661,6 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, bool bDoTime = nb->bDoTime; cudaStream_t stream = nb->stream[iloc]; - bool bCalcEner = flags & GMX_FORCE_ENERGY; - bool bCalcFshift = flags & GMX_FORCE_VIRIAL; - /* don't launch non-local copy-back if there was no non-local work to do */ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) { @@ -708,15 +703,15 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb, /* only transfer energies in the local stream */ if (iloc == InteractionLocality::Local) { - /* DtoH fshift */ - if (bCalcFshift) + /* DtoH fshift when virial is needed */ + if (forceFlags.computeVirial) { cu_copy_D2H_async(nb->nbst.fshift, adat->fshift, SHIFTS * sizeof(*nb->nbst.fshift), stream); } /* DtoH energies */ - if (bCalcEner) + if (forceFlags.computeEnergy) { cu_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, sizeof(*nb->nbst.e_lj), stream); diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h index 3d7871c969..599c97edd4 100644 --- a/src/gromacs/nbnxm/gpu_common.h +++ b/src/gromacs/nbnxm/gpu_common.h @@ -58,7 +58,7 @@ #include "gromacs/gpu_utils/gpu_utils.h" #include "gromacs/listed_forces/gpubonded.h" #include "gromacs/math/vec.h" -#include "gromacs/mdlib/force_flags.h" +#include "gromacs/mdlib/ppforceworkload.h" #include "gromacs/nbnxm/nbnxm.h" #include "gromacs/pbcutil/ishift.h" #include "gromacs/timing/gpu_timing.h" @@ -367,7 +367,7 @@ gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t *timings, //TODO: move into shared source file with gmx_compile_cpp_as_cuda //NOLINTNEXTLINE(misc-definitions-in-headers) bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, - const int flags, + const gmx::ForceFlags &forceFlags, const AtomLocality aloc, real *e_lj, real *e_el, @@ -410,13 +410,10 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, gpuStreamSynchronize(nb->stream[iLocality]); } - bool calcEner = (flags & GMX_FORCE_ENERGY) != 0; - bool calcFshift = (flags & GMX_FORCE_VIRIAL) != 0; - - gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, calcEner, + gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, forceFlags.computeEnergy, nb->bDoTime != 0); - gpu_reduce_staged_outputs(nb->nbst, iLocality, calcEner, calcFshift, + gpu_reduce_staged_outputs(nb->nbst, iLocality, forceFlags.computeEnergy, forceFlags.computeVirial, e_lj, e_el, as_rvec_array(shiftForces.data())); } @@ -438,7 +435,7 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, * pruning flags. * * \param[in] nb The nonbonded data GPU structure - * \param[in] flags Force flags + * \param[in] forceFlags Force schedule flags * \param[in] aloc Atom locality identifier * \param[out] e_lj Pointer to the LJ energy output to accumulate into * \param[out] e_el Pointer to the electrostatics energy output to accumulate into @@ -448,7 +445,7 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb, */ //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file float gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb, - int flags, + const gmx::ForceFlags &forceFlags, AtomLocality aloc, real *e_lj, real *e_el, @@ -459,7 +456,7 @@ float gpu_wait_finish_task(gmx_nbnxn_gpu_t *nb, (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local) ? ewcWAIT_GPU_NB_L : ewcWAIT_GPU_NB_NL; wallcycle_start(wcycle, cycleCounter); - gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, shiftForces, + gpu_try_finish_task(nb, forceFlags, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle); float waitTime = wallcycle_stop(wcycle, cycleCounter); diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp index c608a660c2..1303a20e48 100644 --- a/src/gromacs/nbnxm/kerneldispatch.cpp +++ b/src/gromacs/nbnxm/kerneldispatch.cpp @@ -464,7 +464,8 @@ static void accountFlops(t_nrnb *nrnb, void nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality, const interaction_const_t &ic, - int forceFlags, + int legacyForceFlags, + const gmx::ForceFlags &forceFlags, int clearF, const t_forcerec &fr, gmx_enerdata_t *enerd, @@ -482,7 +483,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality nbat.get(), ic, fr.shift_vec, - forceFlags, + legacyForceFlags, clearF, enerd->grpp.ener[egCOULSR].data(), fr.bBHAM ? @@ -499,7 +500,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality nbnxn_kernel_gpu_ref(pairlistSet.gpuList(), nbat.get(), &ic, fr.shift_vec, - forceFlags, + legacyForceFlags, clearF, nbat->out[0].f, nbat->out[0].fshift.data(), @@ -514,7 +515,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality } - accountFlops(nrnb, pairlistSet, *this, ic, forceFlags); + accountFlops(nrnb, pairlistSet, *this, ic, legacyForceFlags); } void diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 86394f02ae..5a6ff9dea4 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -286,7 +286,8 @@ struct nonbonded_verlet_t //! \brief Executes the non-bonded kernel of the GPU or launches it on the GPU void dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality, const interaction_const_t &ic, - int forceFlags, + int legacyForceFlags, + const gmx::ForceFlags &forceFlags, int clearF, const t_forcerec &fr, gmx_enerdata_t *enerd, diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h index 0fbd9e4690..08f0aa9f96 100644 --- a/src/gromacs/nbnxm/nbnxm_gpu.h +++ b/src/gromacs/nbnxm/nbnxm_gpu.h @@ -59,6 +59,7 @@ enum class GpuTaskCompletion; namespace gmx { class GpuBonded; +class ForceFlags; } namespace Nbnxm @@ -92,9 +93,9 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_gpu_t gmx_unused *nb, * */ GPU_FUNC_QUALIFIER -void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - InteractionLocality gmx_unused iloc) GPU_FUNC_TERM; +void gpu_launch_kernel(gmx_nbnxn_gpu_t gmx_unused *nb, + const gmx::ForceFlags gmx_unused &forceFlags, + InteractionLocality gmx_unused iloc) GPU_FUNC_TERM; /*! \brief * Launch asynchronously the nonbonded prune-only kernel. @@ -141,11 +142,11 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t gmx_unused *nb, * (and energies/shift forces if required). */ GPU_FUNC_QUALIFIER -void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, - nbnxn_atomdata_t gmx_unused *nbatom, - int gmx_unused flags, - AtomLocality gmx_unused aloc, - bool gmx_unused copyBackNbForce) GPU_FUNC_TERM; +void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, + nbnxn_atomdata_t gmx_unused *nbatom, + const gmx::ForceFlags gmx_unused &forceFlags, + AtomLocality gmx_unused aloc, + bool gmx_unused copyBackNbForce) GPU_FUNC_TERM; /*! \brief Attempts to complete nonbonded GPU task. * @@ -174,25 +175,25 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb, * force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning * the energy and Fshift contributions for some external/centralized reduction. * - * \param[in] nb The nonbonded data GPU structure - * \param[in] flags Force flags - * \param[in] aloc Atom locality identifier - * \param[out] e_lj Pointer to the LJ energy output to accumulate into - * \param[out] e_el Pointer to the electrostatics energy output to accumulate into + * \param[in] nb The nonbonded data GPU structure + * \param[in] forceFlags Force schedule flags + * \param[in] aloc Atom locality identifier + * \param[out] e_lj Pointer to the LJ energy output to accumulate into + * \param[out] e_el Pointer to the electrostatics energy output to accumulate into * \param[out] shiftForces Shift forces buffer to accumulate into * \param[in] completionKind Indicates whether nnbonded task completion should only be checked rather than waited for - * \param[out] wcycle Pointer to wallcycle data structure - * \returns True if the nonbonded tasks associated with \p aloc locality have completed + * \param[out] wcycle Pointer to wallcycle data structure + * \returns True if the nonbonded tasks associated with \p aloc locality have completed */ GPU_FUNC_QUALIFIER -bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - AtomLocality gmx_unused aloc, - real gmx_unused *e_lj, - real gmx_unused *e_el, +bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, + const gmx::ForceFlags gmx_unused &forceFlags, + AtomLocality gmx_unused aloc, + real gmx_unused *e_lj, + real gmx_unused *e_el, gmx::ArrayRef gmx_unused shiftForces, - GpuTaskCompletion gmx_unused completionKind, - gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(false); + GpuTaskCompletion gmx_unused completionKind, + gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(false); /*! \brief Completes the nonbonded GPU task blocking until GPU tasks and data * transfers to finish. @@ -202,7 +203,7 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, * pruning flags. * * \param[in] nb The nonbonded data GPU structure - * \param[in] flags Force flags + * \param[in] forceFlags Force schedule flags * \param[in] aloc Atom locality identifier * \param[out] e_lj Pointer to the LJ energy output to accumulate into * \param[out] e_el Pointer to the electrostatics energy output to accumulate into @@ -210,12 +211,12 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, */ GPU_FUNC_QUALIFIER float gpu_wait_finish_task(gmx_nbnxn_gpu_t gmx_unused *nb, - int gmx_unused flags, - AtomLocality gmx_unused aloc, - real gmx_unused *e_lj, - real gmx_unused *e_el, - gmx::ArrayRef gmx_unused shiftForces, - gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0); + const gmx::ForceFlags gmx_unused &forceFlags, + AtomLocality gmx_unused aloc, + real gmx_unused *e_lj, + real gmx_unused *e_el, + gmx::ArrayRef gmx_unused shiftForces, + gmx_wallcycle gmx_unused *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0); /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */ GPU_FUNC_QUALIFIER diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 99a7829e29..365b404ad6 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -72,7 +72,7 @@ #include "gromacs/gpu_utils/gputraits_ocl.h" #include "gromacs/gpu_utils/oclutils.h" #include "gromacs/hardware/hw_info.h" -#include "gromacs/mdlib/force_flags.h" +#include "gromacs/mdlib/ppforceworkload.h" #include "gromacs/nbnxm/atomdata.h" #include "gromacs/nbnxm/gpu_common.h" #include "gromacs/nbnxm/gpu_common_utils.h" @@ -468,7 +468,7 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t *nb, are finished and synchronize with this event in the non-local stream. */ void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, - const int flags, + const gmx::ForceFlags &forceFlags, const Nbnxm::InteractionLocality iloc) { cl_atomdata_t *adat = nb->atdat; @@ -477,8 +477,6 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, cl_timers_t *t = nb->timers; cl_command_queue stream = nb->stream[iloc]; - bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0; - int bCalcFshift = flags & GMX_FORCE_VIRIAL; bool bDoTime = (nb->bDoTime) != 0; cl_nbparam_params_t nbparams_params; @@ -548,17 +546,20 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, const auto kernel = select_nbnxn_kernel(nb, nbp->eeltype, nbp->vdwtype, - bCalcEner, + forceFlags.computeEnergy, (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune)); + // The OpenCL kernel takes int as second to last argument because bool is + // not supported as a kernel argument type (sizeof(bool) is implementation defined). + const int computeFshift = forceFlags.computeVirial; if (useLjCombRule(nb->nbparam->vdwtype)) { const auto kernelArgs = prepareGpuKernelArguments(kernel, config, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, - &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift); + &plist->sci, &plist->cj4, &plist->excl, &computeFshift); launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs); } @@ -569,7 +570,7 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, - &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift); + &plist->sci, &plist->cj4, &plist->excl, &computeFshift); launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs); } @@ -733,9 +734,9 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, */ void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, struct nbnxn_atomdata_t *nbatom, - const int flags, + const gmx::ForceFlags &forceFlags, const AtomLocality aloc, - const bool gmx_unused copyBackNbForce) + const bool gmx_unused copyBackNbForce) { GMX_ASSERT(nb, "Need a valid nbnxn_gpu object"); @@ -750,10 +751,6 @@ void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, bool bDoTime = nb->bDoTime == CL_TRUE; cl_command_queue stream = nb->stream[iloc]; - bool bCalcEner = (flags & GMX_FORCE_ENERGY) != 0; - int bCalcFshift = flags & GMX_FORCE_VIRIAL; - - /* don't launch non-local copy-back if there was no non-local work to do */ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) { @@ -806,15 +803,15 @@ void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb, /* only transfer energies in the local stream */ if (iloc == InteractionLocality::Local) { - /* DtoH fshift */ - if (bCalcFshift) + /* DtoH fshift when virial is needed */ + if (forceFlags.computeVirial) { ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); } /* DtoH energies */ - if (bCalcEner) + if (forceFlags.computeEnergy) { ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); -- 2.22.0