NBAtomData* adat = nb->atdat;
gpu_plist* plist = nb->plist[iloc];
- cu_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed HtoD section */
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
}
/* HtoD x, q */
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
}
/* When we get here all misc operations issued in the local stream as well as
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cu_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
}
/* Kernel launch config:
config.sharedMemorySize);
}
- auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+ auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
const auto kernel =
select_nbnxn_kernel(nbp->elecType,
nbp->vdwType,
if (bDoTime)
{
- t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
}
if (GMX_NATIVE_WINDOWS)
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cu_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
GpuRegionTimer* timer = nullptr;
if (bDoTime)
{
- timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+ timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+ : timers->interaction[iloc].rollingPrune_k);
}
/* beginning of timed prune calculation section */
/* extract the data */
NBAtomData* adat = nb->atdat;
- cu_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
/* beginning of timed D2H section */
if (bDoTime)
{
- t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
}
/* With DD the local D2H transfer can only start after the non-local
if (bDoTime)
{
- t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
}
}
nb->bUseTwoStreams = bLocalAndNonlocal;
- nb->timers = new cu_timers_t();
+ nb->timers = new Nbnxm::GpuTimers();
snew(nb->timings, 1);
/* init nbst */
int nalloc, natoms;
bool realloced;
bool bDoTime = nb->bDoTime;
- cu_timers_t* timers = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
NBAtomData* d_atdat = nb->atdat;
const DeviceContext& deviceContext = *nb->deviceContext_;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
/*! \brief cluster size = number of atoms per cluster. */
static constexpr int c_clSize = c_nbnxnGpuClusterSize;
-/** \internal
- * \brief Typedef of actual timer type.
- */
-typedef struct Nbnxm::gpu_timers_t cu_timers_t;
-
/*! \internal
* \brief Main data structure for CUDA nonbonded force calculations.
*/
/*! \brief True if event-based timing is enabled. */
bool bDoTime = false;
/*! \brief CUDA event-based timers. */
- cu_timers_t* timers = nullptr;
+ Nbnxm::GpuTimers* timers = nullptr;
/*! \brief Timing data. TODO: deprecate this and query timers for accumulated data instead */
gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
};
* - 1st pass prune: ran during the current step (prior to the force kernel);
* - rolling prune: ran at the end of the previous step (prior to the current step H2D xq);
*
- * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
- * after calling this function.
+ * Note that the resetting of Nbnxm::GpuTimers::Interaction::didPrune and
+ * Nbnxm::GpuTimers::Interaction::didRollingPrune should happen after calling this function.
*
* \param[in] timers structs with GPU timer objects
* \param[inout] timings GPU task timing data
* \param[in] iloc interaction locality
*/
-template<typename GpuTimers>
-static void countPruneKernelTime(GpuTimers* timers,
+static void countPruneKernelTime(Nbnxm::GpuTimers* timers,
gmx_wallclock_gpu_nbnxn_t* timings,
const InteractionLocality iloc)
{
- gpu_timers_t::Interaction& iTimers = timers->interaction[iloc];
+ GpuTimers::Interaction& iTimers = timers->interaction[iloc];
// We might have not done any pruning (e.g. if we skipped with empty domains).
if (!iTimers.didPrune && !iTimers.didRollingPrune)
* counters could end up being inconsistent due to not being incremented
* on some of the node when this is skipped on empty local domains!
*
- * \tparam GpuTimers GPU timers type
* \tparam GpuPairlist Pair list type
* \param[out] timings Pointer to the NB GPU timings data
* \param[in] timers Pointer to GPU timers data
* \param[in] doTiming True if timing is enabled.
*
*/
-template<typename GpuTimers, typename GpuPairlist>
+template<typename GpuPairlist>
static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
- GpuTimers* timers,
+ Nbnxm::GpuTimers* timers,
const GpuPairlist* plist,
AtomLocality atomLocality,
const gmx::StepWorkload& stepWork,
- * The two-sized arrays hold the local and non-local values and should always
- * be indexed with eintLocal/eintNonlocal.
+ * The two-sized arrays hold the local and non-local values and should always
+ * be indexed with the corresponding AtomLocality or InteractionLocality enum value.
*/
-struct gpu_timers_t
+struct GpuTimers
{
/*! \internal
* \brief Timers for local or non-local coordinate/force transfers
//! timers for coordinate/force transfers (every step)
gmx::EnumerationArray<AtomLocality, XFTransfers> xf;
//! timers for interaction related transfers
- gmx::EnumerationArray<InteractionLocality, Nbnxm::gpu_timers_t::Interaction> interaction;
+ gmx::EnumerationArray<InteractionLocality, Nbnxm::GpuTimers::Interaction> interaction;
};
/*! \internal
}
}
- gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];
+ GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
if (bDoTime)
{
NBAtomData* adat = nb->atdat;
gpu_plist* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed HtoD section */
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
}
/* HtoD x, q */
atomsRange.size(),
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
}
/* When we get here all misc operations issued in the local stream as well as
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
}
/* kernel launch config */
fillin_ocl_structures(nbp, &nbparams_params);
- auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+ auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
constexpr char kernelName[] = "k_calc_nb";
const auto kernel =
select_nbnxn_kernel(nb,
if (bDoTime)
{
- t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
}
}
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
GpuRegionTimer* timer = nullptr;
if (bDoTime)
{
- timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+ timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+ : timers->interaction[iloc].rollingPrune_k);
}
/* beginning of timed prune calculation section */
"beginning of the copy back function.");
NBAtomData* adat = nb->atdat;
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
/* beginning of timed D2H section */
if (bDoTime)
{
- t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
}
/* With DD the local D2H transfer can only start after the non-local
atomsRange.size(),
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
/* kick off work */
cl_error = clFlush(deviceStream.stream());
SHIFTS,
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
}
/* DtoH energies */
1,
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
"Sizes of host- and device-side electrostatic energy terms should be the "
"same.");
1,
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
}
}
if (bDoTime)
{
- t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
}
}
nb->bUseTwoStreams = bLocalAndNonlocal;
- nb->timers = new cl_timers_t();
+ nb->timers = new Nbnxm::GpuTimers();
snew(nb->timings, 1);
/* set device info, just point it to the right GPU among the detected ones */
int nalloc, natoms;
bool realloced;
bool bDoTime = nb->bDoTime;
- cl_timers_t* timers = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
NBAtomData* d_atdat = nb->atdat;
const DeviceContext& deviceContext = *nb->deviceContext_;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
} cl_nbparam_params_t;
-/** \internal
- * \brief Typedef of actual timer type.
- */
-typedef struct Nbnxm::gpu_timers_t cl_timers_t;
-
/*! \internal
* \brief Main data structure for OpenCL nonbonded force calculations.
*/
//! True if event-based timing is enabled.
bool bDoTime = false;
//! OpenCL event-based timers.
- cl_timers_t* timers = nullptr;
+ Nbnxm::GpuTimers* timers = nullptr;
//! Timing data. TODO: deprecate this and query timers for accumulated data instead
gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
};
/*! \brief True if event-based timing is enabled. Always false for SYCL. */
bool bDoTime = false;
/*! \brief Dummy timers. */
- Nbnxm::gpu_timers_t* timers = nullptr;
+ Nbnxm::GpuTimers* timers = nullptr;
/*! \brief Dummy timing data. */
gmx_wallclock_gpu_nbnxn_t* timings = nullptr;