fr->nbv = Nbnxm::init_nb_verlet(mdlog, bFEP_NonBonded, ir, fr,
cr, hardwareInfo, deviceInfo,
- mtop, box);
+ mtop, box, wcycle);
if (useGpuForBonded)
{
}
}
- nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb, wcycle);
+ nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb);
}
static inline void clear_rvecs_omp(int n, rvec v[])
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
as_rvec_array(force->unpaddedArrayRef().data()),
BufferOpsUseGpu::False,
- GpuBufferOpsAccumulateForce::Null,
- wcycle);
+ GpuBufferOpsAccumulateForce::Null);
}
}
}
*/
if (nbv->isDynamicPruningStepGpu(step))
{
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
nbv->dispatchPruneKernelGpu(step);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
/* now clear the GPU outputs while we finish the step on the CPU */
// NS step is also a virial step (on which f buf ops are deactivated).
if (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
{
- nbv->atomdata_init_add_nbat_f_to_f_gpu(wcycle);
+ nbv->atomdata_init_add_nbat_f_to_f_gpu();
}
}
else
{
nbv->setCoordinates(Nbnxm::AtomLocality::Local, false,
- x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
+ x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata));
}
if (bUseGPU)
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
nbv->setCoordinates(Nbnxm::AtomLocality::NonLocal, false,
- x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
+ x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata));
}
nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms,
inputrec->fepvals, lambda.data(),
- enerd, flags, nrnb, wcycle);
+ enerd, flags, nrnb);
if (havePPDomainDecomposition(cr))
{
nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms,
inputrec->fepvals, lambda.data(),
- enerd, flags, nrnb, wcycle);
+ enerd, flags, nrnb);
}
}
wallcycle_stop(wcycle, ewcFORCE);
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f,
BufferOpsUseGpu::False,
- GpuBufferOpsAccumulateForce::Null,
- wcycle);
+ GpuBufferOpsAccumulateForce::Null);
wallcycle_start_nocount(wcycle, ewcFORCE);
nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
}
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
- forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+ forceOut.f, useGpuFBufOps, accumulateForce);
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
}
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
- forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+ forceOut.f, useGpuFBufOps, accumulateForce);
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
int clearF,
const t_forcerec &fr,
gmx_enerdata_t *enerd,
- t_nrnb *nrnb,
- gmx_wallcycle *wcycle)
+ t_nrnb *nrnb)
{
const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality);
fr.bBHAM ?
enerd->grpp.ener[egBHAMSR].data() :
enerd->grpp.ener[egLJSR].data(),
- wcycle);
+ wcycle_);
break;
case Nbnxm::KernelType::Gpu8x8x8:
real *lambda,
gmx_enerdata_t *enerd,
const int forceFlags,
- t_nrnb *nrnb,
- gmx_wallcycle *wcycle)
+ t_nrnb *nrnb)
{
const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists();
GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(), "Number of lists should be same as number of NB threads");
- wallcycle_sub_start(wcycle, ewcsNONBONDED_FEP);
+ wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP);
#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
for (int th = 0; th < nbl_fep.ssize(); th++)
{
enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
}
}
- wallcycle_sub_stop(wcycle, ewcsNONBONDED_FEP);
+ wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP);
}
const bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
BufferOpsUseGpu useGpu,
- void *xPmeDevicePtr,
- gmx_wallcycle *wcycle)
+ void *xPmeDevicePtr)
{
- wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
auto fnPtr = (useGpu == BufferOpsUseGpu::True) ?
nbnxn_atomdata_copy_x_to_nbat_x<true> :
as_rvec_array(x.data()),
nbat.get(), gpu_nbv, xPmeDevicePtr);
- wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
- wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality,
rvec *f,
BufferOpsUseGpu useGpu,
- GpuBufferOpsAccumulateForce accumulateForce,
- gmx_wallcycle *wcycle)
+ GpuBufferOpsAccumulateForce accumulateForce)
{
GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) &&
return;
}
- wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce);
- wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
- wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
void
-nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle)
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
{
- wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
const Nbnxm::GridSet &gridSet = pairSearch_->gridSet();
gpu_nbv,
gridSet.numRealAtomsTotal());
- wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
- wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
real nonbonded_verlet_t::pairlistInnerRadius() const
std::unique_ptr<PairSearch> pairSearch,
std::unique_ptr<nbnxn_atomdata_t> nbat,
const Nbnxm::KernelSetup &kernelSetup,
- gmx_nbnxn_gpu_t *gpu_nbv);
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ gmx_wallcycle *wcycle);
~nonbonded_verlet_t();
bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
BufferOpsUseGpu useGpu,
- void *xPmeDevicePtr,
- gmx_wallcycle *wcycle);
+ void *xPmeDevicePtr);
//! Init for GPU version of setup coordinates in Nbnxm
void atomdata_init_copy_x_to_nbat_x_gpu();
int clearF,
const t_forcerec &fr,
gmx_enerdata_t *enerd,
- t_nrnb *nrnb,
- gmx_wallcycle *wcycle);
+ t_nrnb *nrnb);
//! Executes the non-bonded free-energy kernel, always runs on the CPU
void dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocality,
real *lambda,
gmx_enerdata_t *enerd,
int forceFlags,
- t_nrnb *nrnb,
- gmx_wallcycle *wcycle);
+ t_nrnb *nrnb);
//! Add the forces stored in nbat to f, zeros the forces in nbat */
void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality,
rvec *f,
BufferOpsUseGpu useGpu,
- GpuBufferOpsAccumulateForce accumulateForce,
- gmx_wallcycle *wcycle);
+ GpuBufferOpsAccumulateForce accumulateForce);
/*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
- void atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle);
+ void atomdata_init_add_nbat_f_to_f_gpu();
/*! \brief H2D transfer of force buffer*/
void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
private:
//! The non-bonded setup, also affects the pairlist construction kernel
Nbnxm::KernelSetup kernelSetup_;
+ //! \brief Pointer to wallcycle structure.
+ gmx_wallcycle *wcycle_;
public:
//! GPU Nbnxm data, only used with a physical GPU (TODO: use unique_ptr)
gmx_nbnxn_gpu_t *gpu_nbv;
const gmx_hw_info_t &hardwareInfo,
const gmx_device_info_t *deviceInfo,
const gmx_mtop_t *mtop,
- matrix box);
+ matrix box,
+ gmx_wallcycle *wcycle);
} // namespace Nbnxm
const gmx_hw_info_t &hardwareInfo,
const gmx_device_info_t *deviceInfo,
const gmx_mtop_t *mtop,
- matrix box)
+ matrix box,
+ gmx_wallcycle *wcycle)
{
const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
const bool useGpu = deviceInfo != nullptr;
std::move(pairSearch),
std::move(nbat),
kernelSetup,
- gpu_nbv);
+ gpu_nbv,
+ wcycle);
}
} // namespace Nbnxm
std::unique_ptr<PairSearch> pairSearch,
std::unique_ptr<nbnxn_atomdata_t> nbat_in,
const Nbnxm::KernelSetup &kernelSetup,
- gmx_nbnxn_gpu_t *gpu_nbv_ptr) :
+ gmx_nbnxn_gpu_t *gpu_nbv_ptr,
+ gmx_wallcycle *wcycle) :
pairlistSets_(std::move(pairlistSets)),
pairSearch_(std::move(pairSearch)),
nbat(std::move(nbat_in)),
kernelSetup_(kernelSetup),
+ wcycle_(wcycle),
gpu_nbv(gpu_nbv_ptr)
{
GMX_RELEASE_ASSERT(pairlistSets_, "Need valid pairlistSets");
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxassert.h"
#include "clusterdistancekerneltype.h"
void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step)
{
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
+
const bool stepIsEven = (pairlistSets().numStepsWithPairlist(step) % 2 == 0);
Nbnxm::gpu_launch_kernel_pruneonly(gpu_nbv,
stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
pairlistSets().params().numRollingPruningParts);
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}