From: Szilárd Páll Date: Mon, 29 Jul 2019 16:24:46 +0000 (+0200) Subject: Store wcycle pointer in nonbonded_verlet_t X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=ccc02b1e9bcfabd1fdf983f14accf949e47adb61;p=alexxy%2Fgromacs.git Store wcycle pointer in nonbonded_verlet_t Removed wcycle arguments from method calls that no longer need it and moved a few instances of cycle counting into the nbmxn module. Change-Id: Ic5646b3bb85ed2c66137e9db7bd70822df95042b --- diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index af151e519d..db5bd9fd1d 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -1980,7 +1980,7 @@ void init_forcerec(FILE *fp, fr->nbv = Nbnxm::init_nb_verlet(mdlog, bFEP_NonBonded, ir, fr, cr, hardwareInfo, deviceInfo, - mtop, box); + mtop, box, wcycle); if (useGpuForBonded) { diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index e525fad297..70a25665a2 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -346,7 +346,7 @@ static void do_nb_verlet(t_forcerec *fr, } } - nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb, wcycle); + nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb); } static inline void clear_rvecs_omp(int n, rvec v[]) @@ -693,8 +693,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, as_rvec_array(force->unpaddedArrayRef().data()), BufferOpsUseGpu::False, - GpuBufferOpsAccumulateForce::Null, - wcycle); + GpuBufferOpsAccumulateForce::Null); } } } @@ -831,11 +830,7 @@ launchGpuEndOfStepTasks(nonbonded_verlet_t *nbv, */ if (nbv->isDynamicPruningStepGpu(step)) { - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED); nbv->dispatchPruneKernelGpu(step); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); } /* now clear the GPU outputs while we finish the step on the CPU */ @@ -1108,13 +1103,13 @@ void do_force(FILE *fplog, // NS step is also a virial step (on which f buf ops are deactivated). if (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) { - nbv->atomdata_init_add_nbat_f_to_f_gpu(wcycle); + nbv->atomdata_init_add_nbat_f_to_f_gpu(); } } else { nbv->setCoordinates(Nbnxm::AtomLocality::Local, false, - x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle); + x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata)); } if (bUseGPU) @@ -1181,7 +1176,7 @@ void do_force(FILE *fplog, dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle); nbv->setCoordinates(Nbnxm::AtomLocality::NonLocal, false, - x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle); + x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata)); } @@ -1318,14 +1313,14 @@ void do_force(FILE *fplog, nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local, fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms, inputrec->fepvals, lambda.data(), - enerd, flags, nrnb, wcycle); + enerd, flags, nrnb); if (havePPDomainDecomposition(cr)) { nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal, fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms, inputrec->fepvals, lambda.data(), - enerd, flags, nrnb, wcycle); + enerd, flags, nrnb); } } @@ -1344,8 +1339,7 @@ void do_force(FILE *fplog, wallcycle_stop(wcycle, ewcFORCE); nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f, BufferOpsUseGpu::False, - GpuBufferOpsAccumulateForce::Null, - wcycle); + GpuBufferOpsAccumulateForce::Null); wallcycle_start_nocount(wcycle, ewcFORCE); @@ -1427,7 +1421,7 @@ void do_force(FILE *fplog, nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal); } nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal, - forceOut.f, useGpuFBufOps, accumulateForce, wcycle); + forceOut.f, useGpuFBufOps, accumulateForce); if (useGpuFBufOps == BufferOpsUseGpu::True) { nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal); @@ -1536,7 +1530,7 @@ void do_force(FILE *fplog, nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local); } nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local, - forceOut.f, useGpuFBufOps, accumulateForce, wcycle); + forceOut.f, useGpuFBufOps, accumulateForce); if (useGpuFBufOps == BufferOpsUseGpu::True) { nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local); diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp index 99b96d11b6..a600029a2b 100644 --- a/src/gromacs/nbnxm/kerneldispatch.cpp +++ b/src/gromacs/nbnxm/kerneldispatch.cpp @@ -467,8 +467,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality int clearF, const t_forcerec &fr, gmx_enerdata_t *enerd, - t_nrnb *nrnb, - gmx_wallcycle *wcycle) + t_nrnb *nrnb) { const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality); @@ -488,7 +487,7 @@ nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality fr.bBHAM ? enerd->grpp.ener[egBHAMSR].data() : enerd->grpp.ener[egLJSR].data(), - wcycle); + wcycle_); break; case Nbnxm::KernelType::Gpu8x8x8: @@ -527,8 +526,7 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocali real *lambda, gmx_enerdata_t *enerd, const int forceFlags, - t_nrnb *nrnb, - gmx_wallcycle *wcycle) + t_nrnb *nrnb) { const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists(); @@ -567,7 +565,7 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocali GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(), "Number of lists should be same as number of NB threads"); - wallcycle_sub_start(wcycle, ewcsNONBONDED_FEP); + wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP); #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize()) for (int th = 0; th < nbl_fep.ssize(); th++) { @@ -624,5 +622,5 @@ nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocali enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; } } - wallcycle_sub_stop(wcycle, ewcsNONBONDED_FEP); + wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP); } diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index 681dcf2d8b..77119044cb 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -136,11 +136,10 @@ void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality locality const bool fillLocal, gmx::ArrayRef x, BufferOpsUseGpu useGpu, - void *xPmeDevicePtr, - gmx_wallcycle *wcycle) + void *xPmeDevicePtr) { - wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS); + wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS); auto fnPtr = (useGpu == BufferOpsUseGpu::True) ? nbnxn_atomdata_copy_x_to_nbat_x : @@ -150,8 +149,8 @@ void nonbonded_verlet_t::setCoordinates(const Nbnxm::AtomLocality locality as_rvec_array(x.data()), nbat.get(), gpu_nbv, xPmeDevicePtr); - wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS); - wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS); + wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); } gmx::ArrayRef nonbonded_verlet_t::getGridIndices() const @@ -163,8 +162,7 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality, rvec *f, BufferOpsUseGpu useGpu, - GpuBufferOpsAccumulateForce accumulateForce, - gmx_wallcycle *wcycle) + GpuBufferOpsAccumulateForce accumulateForce) { GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && @@ -178,22 +176,22 @@ nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality return; } - wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); + wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces : reduceForces; fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce); - wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS); - wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); + wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); } void -nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle) +nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu() { - wallcycle_start(wcycle, ewcNB_XF_BUF_OPS); - wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS); + wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS); + wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS); const Nbnxm::GridSet &gridSet = pairSearch_->gridSet(); @@ -201,8 +199,8 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle) gpu_nbv, gridSet.numRealAtomsTotal()); - wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS); - wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS); + wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS); + wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS); } real nonbonded_verlet_t::pairlistInnerRadius() const diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index 7a6491a4aa..a749b074b9 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -212,7 +212,8 @@ struct nonbonded_verlet_t std::unique_ptr pairSearch, std::unique_ptr nbat, const Nbnxm::KernelSetup &kernelSetup, - gmx_nbnxn_gpu_t *gpu_nbv); + gmx_nbnxn_gpu_t *gpu_nbv, + gmx_wallcycle *wcycle); ~nonbonded_verlet_t(); @@ -261,8 +262,7 @@ struct nonbonded_verlet_t bool fillLocal, gmx::ArrayRef x, BufferOpsUseGpu useGpu, - void *xPmeDevicePtr, - gmx_wallcycle *wcycle); + void *xPmeDevicePtr); //! Init for GPU version of setup coordinates in Nbnxm void atomdata_init_copy_x_to_nbat_x_gpu(); @@ -296,8 +296,7 @@ struct nonbonded_verlet_t int clearF, const t_forcerec &fr, gmx_enerdata_t *enerd, - t_nrnb *nrnb, - gmx_wallcycle *wcycle); + t_nrnb *nrnb); //! Executes the non-bonded free-energy kernel, always runs on the CPU void dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocality, @@ -309,18 +308,16 @@ struct nonbonded_verlet_t real *lambda, gmx_enerdata_t *enerd, int forceFlags, - t_nrnb *nrnb, - gmx_wallcycle *wcycle); + t_nrnb *nrnb); //! Add the forces stored in nbat to f, zeros the forces in nbat */ void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality, rvec *f, BufferOpsUseGpu useGpu, - GpuBufferOpsAccumulateForce accumulateForce, - gmx_wallcycle *wcycle); + GpuBufferOpsAccumulateForce accumulateForce); /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */ - void atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle); + void atomdata_init_add_nbat_f_to_f_gpu(); /*! \brief H2D transfer of force buffer*/ void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality); @@ -378,6 +375,8 @@ struct nonbonded_verlet_t private: //! The non-bonded setup, also affects the pairlist construction kernel Nbnxm::KernelSetup kernelSetup_; + //! \brief Pointer to wallcycle structure. + gmx_wallcycle *wcycle_; public: //! GPU Nbnxm data, only used with a physical GPU (TODO: use unique_ptr) gmx_nbnxn_gpu_t *gpu_nbv; @@ -396,7 +395,8 @@ init_nb_verlet(const gmx::MDLogger &mdlog, const gmx_hw_info_t &hardwareInfo, const gmx_device_info_t *deviceInfo, const gmx_mtop_t *mtop, - matrix box); + matrix box, + gmx_wallcycle *wcycle); } // namespace Nbnxm diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp index 18c8182dd5..6715fef9c2 100644 --- a/src/gromacs/nbnxm/nbnxm_setup.cpp +++ b/src/gromacs/nbnxm/nbnxm_setup.cpp @@ -364,7 +364,8 @@ init_nb_verlet(const gmx::MDLogger &mdlog, const gmx_hw_info_t &hardwareInfo, const gmx_device_info_t *deviceInfo, const gmx_mtop_t *mtop, - matrix box) + matrix box, + gmx_wallcycle *wcycle) { const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr); const bool useGpu = deviceInfo != nullptr; @@ -482,7 +483,8 @@ init_nb_verlet(const gmx::MDLogger &mdlog, std::move(pairSearch), std::move(nbat), kernelSetup, - gpu_nbv); + gpu_nbv, + wcycle); } } // namespace Nbnxm @@ -491,11 +493,13 @@ nonbonded_verlet_t::nonbonded_verlet_t(std::unique_ptr pairli std::unique_ptr pairSearch, std::unique_ptr nbat_in, const Nbnxm::KernelSetup &kernelSetup, - gmx_nbnxn_gpu_t *gpu_nbv_ptr) : + gmx_nbnxn_gpu_t *gpu_nbv_ptr, + gmx_wallcycle *wcycle) : pairlistSets_(std::move(pairlistSets)), pairSearch_(std::move(pairSearch)), nbat(std::move(nbat_in)), kernelSetup_(kernelSetup), + wcycle_(wcycle), gpu_nbv(gpu_nbv_ptr) { GMX_RELEASE_ASSERT(pairlistSets_, "Need valid pairlistSets"); diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp index 899b9489be..cc5e3d08d9 100644 --- a/src/gromacs/nbnxm/prunekerneldispatch.cpp +++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp @@ -37,6 +37,7 @@ #include "gromacs/mdlib/gmx_omp_nthreads.h" #include "gromacs/nbnxm/nbnxm.h" +#include "gromacs/timing/wallcycle.h" #include "gromacs/utility/gmxassert.h" #include "clusterdistancekerneltype.h" @@ -97,9 +98,15 @@ nonbonded_verlet_t::dispatchPruneKernelCpu(const Nbnxm::InteractionLocality iLo void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step) { + wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU); + wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_NONBONDED); + const bool stepIsEven = (pairlistSets().numStepsWithPairlist(step) % 2 == 0); Nbnxm::gpu_launch_kernel_pruneonly(gpu_nbv, stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal, pairlistSets().params().numRollingPruningParts); + + wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NONBONDED); + wallcycle_stop(wcycle_, ewcLAUNCH_GPU); }