Removed wcycle arguments from method calls that no longer need them and
moved a few instances of cycle counting into the nbnxm module.
Change-Id: Ic5646b3bb85ed2c66137e9db7bd70822df95042b
fr->nbv = Nbnxm::init_nb_verlet(mdlog, bFEP_NonBonded, ir, fr,
cr, hardwareInfo, deviceInfo,
fr->nbv = Nbnxm::init_nb_verlet(mdlog, bFEP_NonBonded, ir, fr,
cr, hardwareInfo, deviceInfo,
- nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb, wcycle);
+ nbv->dispatchNonbondedKernel(ilocality, *ic, flags, clearF, *fr, enerd, nrnb);
}
static inline void clear_rvecs_omp(int n, rvec v[])
}
static inline void clear_rvecs_omp(int n, rvec v[])
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
as_rvec_array(force->unpaddedArrayRef().data()),
BufferOpsUseGpu::False,
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
as_rvec_array(force->unpaddedArrayRef().data()),
BufferOpsUseGpu::False,
- GpuBufferOpsAccumulateForce::Null,
- wcycle);
+ GpuBufferOpsAccumulateForce::Null);
*/
if (nbv->isDynamicPruningStepGpu(step))
{
*/
if (nbv->isDynamicPruningStepGpu(step))
{
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
nbv->dispatchPruneKernelGpu(step);
nbv->dispatchPruneKernelGpu(step);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
/* now clear the GPU outputs while we finish the step on the CPU */
}
/* now clear the GPU outputs while we finish the step on the CPU */
// NS step is also a virial step (on which f buf ops are deactivated).
if (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
{
// NS step is also a virial step (on which f buf ops are deactivated).
if (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
{
- nbv->atomdata_init_add_nbat_f_to_f_gpu(wcycle);
+ nbv->atomdata_init_add_nbat_f_to_f_gpu();
}
}
else
{
nbv->setCoordinates(Nbnxm::AtomLocality::Local, false,
}
}
else
{
nbv->setCoordinates(Nbnxm::AtomLocality::Local, false,
- x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
+ x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata));
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
nbv->setCoordinates(Nbnxm::AtomLocality::NonLocal, false,
dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
nbv->setCoordinates(Nbnxm::AtomLocality::NonLocal, false,
- x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata), wcycle);
+ x.unpaddedArrayRef(), useGpuXBufOps, pme_gpu_get_device_x(fr->pmedata));
nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms,
inputrec->fepvals, lambda.data(),
nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms,
inputrec->fepvals, lambda.data(),
- enerd, flags, nrnb, wcycle);
if (havePPDomainDecomposition(cr))
{
nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms,
inputrec->fepvals, lambda.data(),
if (havePPDomainDecomposition(cr))
{
nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
fr, as_rvec_array(x.unpaddedArrayRef().data()), forceOut.f, *mdatoms,
inputrec->fepvals, lambda.data(),
- enerd, flags, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f,
BufferOpsUseGpu::False,
wallcycle_stop(wcycle, ewcFORCE);
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f,
BufferOpsUseGpu::False,
- GpuBufferOpsAccumulateForce::Null,
- wcycle);
+ GpuBufferOpsAccumulateForce::Null);
wallcycle_start_nocount(wcycle, ewcFORCE);
wallcycle_start_nocount(wcycle, ewcFORCE);
nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
}
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
}
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
- forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+ forceOut.f, useGpuFBufOps, accumulateForce);
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::NonLocal);
nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
}
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
}
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
- forceOut.f, useGpuFBufOps, accumulateForce, wcycle);
+ forceOut.f, useGpuFBufOps, accumulateForce);
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
int clearF,
const t_forcerec &fr,
gmx_enerdata_t *enerd,
int clearF,
const t_forcerec &fr,
gmx_enerdata_t *enerd,
- t_nrnb *nrnb,
- gmx_wallcycle *wcycle)
{
const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality);
{
const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality);
fr.bBHAM ?
enerd->grpp.ener[egBHAMSR].data() :
enerd->grpp.ener[egLJSR].data(),
fr.bBHAM ?
enerd->grpp.ener[egBHAMSR].data() :
enerd->grpp.ener[egLJSR].data(),
break;
case Nbnxm::KernelType::Gpu8x8x8:
break;
case Nbnxm::KernelType::Gpu8x8x8:
real *lambda,
gmx_enerdata_t *enerd,
const int forceFlags,
real *lambda,
gmx_enerdata_t *enerd,
const int forceFlags,
- t_nrnb *nrnb,
- gmx_wallcycle *wcycle)
{
const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists();
{
const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists();
GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(), "Number of lists should be same as number of NB threads");
GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(), "Number of lists should be same as number of NB threads");
- wallcycle_sub_start(wcycle, ewcsNONBONDED_FEP);
+ wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP);
#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
for (int th = 0; th < nbl_fep.ssize(); th++)
{
#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
for (int th = 0; th < nbl_fep.ssize(); th++)
{
enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
}
}
enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
}
}
- wallcycle_sub_stop(wcycle, ewcsNONBONDED_FEP);
+ wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP);
const bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
BufferOpsUseGpu useGpu,
const bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
BufferOpsUseGpu useGpu,
- void *xPmeDevicePtr,
- gmx_wallcycle *wcycle)
- wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle, ewcsNB_X_BUF_OPS);
+ wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
auto fnPtr = (useGpu == BufferOpsUseGpu::True) ?
nbnxn_atomdata_copy_x_to_nbat_x<true> :
auto fnPtr = (useGpu == BufferOpsUseGpu::True) ?
nbnxn_atomdata_copy_x_to_nbat_x<true> :
as_rvec_array(x.data()),
nbat.get(), gpu_nbv, xPmeDevicePtr);
as_rvec_array(x.data()),
nbat.get(), gpu_nbv, xPmeDevicePtr);
- wallcycle_sub_stop(wcycle, ewcsNB_X_BUF_OPS);
- wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
}
gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality,
rvec *f,
BufferOpsUseGpu useGpu,
nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality,
rvec *f,
BufferOpsUseGpu useGpu,
- GpuBufferOpsAccumulateForce accumulateForce,
- gmx_wallcycle *wcycle)
+ GpuBufferOpsAccumulateForce accumulateForce)
{
GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) &&
{
GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) &&
- wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce);
auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce);
- wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
- wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
-nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle)
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
- wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle, ewcsNB_F_BUF_OPS);
+ wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
const Nbnxm::GridSet &gridSet = pairSearch_->gridSet();
const Nbnxm::GridSet &gridSet = pairSearch_->gridSet();
gpu_nbv,
gridSet.numRealAtomsTotal());
gpu_nbv,
gridSet.numRealAtomsTotal());
- wallcycle_sub_stop(wcycle, ewcsNB_F_BUF_OPS);
- wallcycle_stop(wcycle, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
}
real nonbonded_verlet_t::pairlistInnerRadius() const
}
real nonbonded_verlet_t::pairlistInnerRadius() const
std::unique_ptr<PairSearch> pairSearch,
std::unique_ptr<nbnxn_atomdata_t> nbat,
const Nbnxm::KernelSetup &kernelSetup,
std::unique_ptr<PairSearch> pairSearch,
std::unique_ptr<nbnxn_atomdata_t> nbat,
const Nbnxm::KernelSetup &kernelSetup,
- gmx_nbnxn_gpu_t *gpu_nbv);
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ gmx_wallcycle *wcycle);
bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
BufferOpsUseGpu useGpu,
bool fillLocal,
gmx::ArrayRef<const gmx::RVec> x,
BufferOpsUseGpu useGpu,
- void *xPmeDevicePtr,
- gmx_wallcycle *wcycle);
//! Init for GPU version of setup coordinates in Nbnxm
void atomdata_init_copy_x_to_nbat_x_gpu();
//! Init for GPU version of setup coordinates in Nbnxm
void atomdata_init_copy_x_to_nbat_x_gpu();
int clearF,
const t_forcerec &fr,
gmx_enerdata_t *enerd,
int clearF,
const t_forcerec &fr,
gmx_enerdata_t *enerd,
- t_nrnb *nrnb,
- gmx_wallcycle *wcycle);
//! Executes the non-bonded free-energy kernel, always runs on the CPU
void dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocality,
//! Executes the non-bonded free-energy kernel, always runs on the CPU
void dispatchFreeEnergyKernel(Nbnxm::InteractionLocality iLocality,
real *lambda,
gmx_enerdata_t *enerd,
int forceFlags,
real *lambda,
gmx_enerdata_t *enerd,
int forceFlags,
- t_nrnb *nrnb,
- gmx_wallcycle *wcycle);
//! Add the forces stored in nbat to f, zeros the forces in nbat */
void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality,
rvec *f,
BufferOpsUseGpu useGpu,
//! Add the forces stored in nbat to f, zeros the forces in nbat */
void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality,
rvec *f,
BufferOpsUseGpu useGpu,
- GpuBufferOpsAccumulateForce accumulateForce,
- gmx_wallcycle *wcycle);
+ GpuBufferOpsAccumulateForce accumulateForce);
/*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
/*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
- void atomdata_init_add_nbat_f_to_f_gpu(gmx_wallcycle *wcycle);
+ void atomdata_init_add_nbat_f_to_f_gpu();
/*! \brief H2D transfer of force buffer*/
void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
/*! \brief H2D transfer of force buffer*/
void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
private:
//! The non-bonded setup, also affects the pairlist construction kernel
Nbnxm::KernelSetup kernelSetup_;
private:
//! The non-bonded setup, also affects the pairlist construction kernel
Nbnxm::KernelSetup kernelSetup_;
+ //! \brief Pointer to wallcycle structure.
+ gmx_wallcycle *wcycle_;
public:
//! GPU Nbnxm data, only used with a physical GPU (TODO: use unique_ptr)
gmx_nbnxn_gpu_t *gpu_nbv;
public:
//! GPU Nbnxm data, only used with a physical GPU (TODO: use unique_ptr)
gmx_nbnxn_gpu_t *gpu_nbv;
const gmx_hw_info_t &hardwareInfo,
const gmx_device_info_t *deviceInfo,
const gmx_mtop_t *mtop,
const gmx_hw_info_t &hardwareInfo,
const gmx_device_info_t *deviceInfo,
const gmx_mtop_t *mtop,
+ matrix box,
+ gmx_wallcycle *wcycle);
const gmx_hw_info_t &hardwareInfo,
const gmx_device_info_t *deviceInfo,
const gmx_mtop_t *mtop,
const gmx_hw_info_t &hardwareInfo,
const gmx_device_info_t *deviceInfo,
const gmx_mtop_t *mtop,
+ matrix box,
+ gmx_wallcycle *wcycle)
{
const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
const bool useGpu = deviceInfo != nullptr;
{
const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
const bool useGpu = deviceInfo != nullptr;
std::move(pairSearch),
std::move(nbat),
kernelSetup,
std::move(pairSearch),
std::move(nbat),
kernelSetup,
std::unique_ptr<PairSearch> pairSearch,
std::unique_ptr<nbnxn_atomdata_t> nbat_in,
const Nbnxm::KernelSetup &kernelSetup,
std::unique_ptr<PairSearch> pairSearch,
std::unique_ptr<nbnxn_atomdata_t> nbat_in,
const Nbnxm::KernelSetup &kernelSetup,
- gmx_nbnxn_gpu_t *gpu_nbv_ptr) :
+ gmx_nbnxn_gpu_t *gpu_nbv_ptr,
+ gmx_wallcycle *wcycle) :
pairlistSets_(std::move(pairlistSets)),
pairSearch_(std::move(pairSearch)),
nbat(std::move(nbat_in)),
kernelSetup_(kernelSetup),
pairlistSets_(std::move(pairlistSets)),
pairSearch_(std::move(pairSearch)),
nbat(std::move(nbat_in)),
kernelSetup_(kernelSetup),
gpu_nbv(gpu_nbv_ptr)
{
GMX_RELEASE_ASSERT(pairlistSets_, "Need valid pairlistSets");
gpu_nbv(gpu_nbv_ptr)
{
GMX_RELEASE_ASSERT(pairlistSets_, "Need valid pairlistSets");
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxassert.h"
#include "clusterdistancekerneltype.h"
#include "gromacs/utility/gmxassert.h"
#include "clusterdistancekerneltype.h"
void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step)
{
void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step)
{
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
+
const bool stepIsEven = (pairlistSets().numStepsWithPairlist(step) % 2 == 0);
Nbnxm::gpu_launch_kernel_pruneonly(gpu_nbv,
stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
pairlistSets().params().numRollingPruningParts);
const bool stepIsEven = (pairlistSets().numStepsWithPairlist(step) % 2 == 0);
Nbnxm::gpu_launch_kernel_pruneonly(gpu_nbv,
stepIsEven ? Nbnxm::InteractionLocality::Local : Nbnxm::InteractionLocality::NonLocal,
pairlistSets().params().numRollingPruningParts);
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);