The GPU timers structure is used through per-backend typedef aliases
(cu_timers_t in CUDA, cl_timers_t in OpenCL) in NBNXM, which complicates
the unification of the code. This change eliminates these aliases by using
the underlying type directly, and renames that type from
Nbnxm::gpu_timers_t to Nbnxm::GpuTimers.
Refs #2608
NBAtomData* adat = nb->atdat;
gpu_plist* plist = nb->plist[iloc];
NBAtomData* adat = nb->atdat;
gpu_plist* plist = nb->plist[iloc];
- cu_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed HtoD section */
if (bDoTime)
{
/* beginning of timed HtoD section */
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
- t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
}
/* When we get here all misc operations issued in the local stream as well as
}
/* When we get here all misc operations issued in the local stream as well as
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cu_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
}
/* Kernel launch config:
}
/* Kernel launch config:
config.sharedMemorySize);
}
config.sharedMemorySize);
}
- auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+ auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
const auto kernel =
select_nbnxn_kernel(nbp->elecType,
nbp->vdwType,
const auto kernel =
select_nbnxn_kernel(nbp->elecType,
nbp->vdwType,
- t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
}
if (GMX_NATIVE_WINDOWS)
}
if (GMX_NATIVE_WINDOWS)
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cu_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
GpuRegionTimer* timer = nullptr;
if (bDoTime)
{
GpuRegionTimer* timer = nullptr;
if (bDoTime)
{
- timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+ timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+ : timers->interaction[iloc].rollingPrune_k);
}
/* beginning of timed prune calculation section */
}
/* beginning of timed prune calculation section */
/* extract the data */
NBAtomData* adat = nb->atdat;
/* extract the data */
NBAtomData* adat = nb->atdat;
- cu_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
/* beginning of timed D2H section */
if (bDoTime)
{
/* beginning of timed D2H section */
if (bDoTime)
{
- t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
}
/* With DD the local D2H transfer can only start after the non-local
}
/* With DD the local D2H transfer can only start after the non-local
- t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
nb->bUseTwoStreams = bLocalAndNonlocal;
nb->bUseTwoStreams = bLocalAndNonlocal;
- nb->timers = new cu_timers_t();
+ nb->timers = new Nbnxm::GpuTimers();
snew(nb->timings, 1);
/* init nbst */
snew(nb->timings, 1);
/* init nbst */
int nalloc, natoms;
bool realloced;
bool bDoTime = nb->bDoTime;
int nalloc, natoms;
bool realloced;
bool bDoTime = nb->bDoTime;
- cu_timers_t* timers = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
NBAtomData* d_atdat = nb->atdat;
const DeviceContext& deviceContext = *nb->deviceContext_;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
NBAtomData* d_atdat = nb->atdat;
const DeviceContext& deviceContext = *nb->deviceContext_;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
/*! \brief cluster size = number of atoms per cluster. */
static constexpr int c_clSize = c_nbnxnGpuClusterSize;
/*! \brief cluster size = number of atoms per cluster. */
static constexpr int c_clSize = c_nbnxnGpuClusterSize;
-/** \internal
- * \brief Typedef of actual timer type.
- */
-typedef struct Nbnxm::gpu_timers_t cu_timers_t;
-
/*! \internal
* \brief Main data structure for CUDA nonbonded force calculations.
*/
/*! \internal
* \brief Main data structure for CUDA nonbonded force calculations.
*/
/*! \brief True if event-based timing is enabled. */
bool bDoTime = false;
/*! \brief CUDA event-based timers. */
/*! \brief True if event-based timing is enabled. */
bool bDoTime = false;
/*! \brief CUDA event-based timers. */
- cu_timers_t* timers = nullptr;
+ Nbnxm::GpuTimers* timers = nullptr;
/*! \brief Timing data. TODO: deprecate this and query timers for accumulated data instead */
gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
};
/*! \brief Timing data. TODO: deprecate this and query timers for accumulated data instead */
gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
};
* - 1st pass prune: ran during the current step (prior to the force kernel);
* - rolling prune: ran at the end of the previous step (prior to the current step H2D xq);
*
* - 1st pass prune: ran during the current step (prior to the force kernel);
* - rolling prune: ran at the end of the previous step (prior to the current step H2D xq);
*
- * Note that the resetting of cu_timers_t::didPrune and cu_timers_t::didRollingPrune should happen
- * after calling this function.
+ * Note that the resetting of Nbnxm::GpuTimers::Interaction::didPrune and
+ * Nbnxm::GpuTimers::Interaction::didRollingPrune should happen after calling this function.
*
* \param[in] timers structs with GPU timer objects
* \param[inout] timings GPU task timing data
* \param[in] iloc interaction locality
*/
*
* \param[in] timers structs with GPU timer objects
* \param[inout] timings GPU task timing data
* \param[in] iloc interaction locality
*/
-template<typename GpuTimers>
-static void countPruneKernelTime(GpuTimers* timers,
+static void countPruneKernelTime(Nbnxm::GpuTimers* timers,
gmx_wallclock_gpu_nbnxn_t* timings,
const InteractionLocality iloc)
{
gmx_wallclock_gpu_nbnxn_t* timings,
const InteractionLocality iloc)
{
- gpu_timers_t::Interaction& iTimers = timers->interaction[iloc];
+ GpuTimers::Interaction& iTimers = timers->interaction[iloc];
// We might have not done any pruning (e.g. if we skipped with empty domains).
if (!iTimers.didPrune && !iTimers.didRollingPrune)
// We might have not done any pruning (e.g. if we skipped with empty domains).
if (!iTimers.didPrune && !iTimers.didRollingPrune)
* counters could end up being inconsistent due to not being incremented
* on some of the nodes when this is skipped on empty local domains!
*
* counters could end up being inconsistent due to not being incremented
* on some of the nodes when this is skipped on empty local domains!
*
- * \tparam GpuTimers GPU timers type
* \tparam GpuPairlist Pair list type
* \param[out] timings Pointer to the NB GPU timings data
* \param[in] timers Pointer to GPU timers data
* \tparam GpuPairlist Pair list type
* \param[out] timings Pointer to the NB GPU timings data
* \param[in] timers Pointer to GPU timers data
* \param[in] doTiming True if timing is enabled.
*
*/
* \param[in] doTiming True if timing is enabled.
*
*/
-template<typename GpuTimers, typename GpuPairlist>
+template<typename GpuPairlist>
static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
static inline void gpu_accumulate_timings(gmx_wallclock_gpu_nbnxn_t* timings,
+ Nbnxm::GpuTimers* timers,
const GpuPairlist* plist,
AtomLocality atomLocality,
const gmx::StepWorkload& stepWork,
const GpuPairlist* plist,
AtomLocality atomLocality,
const gmx::StepWorkload& stepWork,
* The two-sized arrays hold the local and non-local values and should always
* be indexed with eintLocal/eintNonlocal.
*/
* The two-sized arrays hold the local and non-local values and should always
* be indexed with eintLocal/eintNonlocal.
*/
{
/*! \internal
* \brief Timers for local or non-local coordinate/force transfers
{
/*! \internal
* \brief Timers for local or non-local coordinate/force transfers
//! timers for coordinate/force transfers (every step)
gmx::EnumerationArray<AtomLocality, XFTransfers> xf;
//! timers for interaction related transfers
//! timers for coordinate/force transfers (every step)
gmx::EnumerationArray<AtomLocality, XFTransfers> xf;
//! timers for interaction related transfers
- gmx::EnumerationArray<InteractionLocality, Nbnxm::gpu_timers_t::Interaction> interaction;
+ gmx::EnumerationArray<InteractionLocality, Nbnxm::GpuTimers::Interaction> interaction;
- gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];
+ GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
NBAtomData* adat = nb->atdat;
gpu_plist* plist = nb->plist[iloc];
NBAtomData* adat = nb->atdat;
gpu_plist* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed HtoD section */
if (bDoTime)
{
/* beginning of timed HtoD section */
if (bDoTime)
{
- t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
atomsRange.size(),
deviceStream,
GpuApiCallBehavior::Async,
atomsRange.size(),
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
- t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
}
/* When we get here all misc operations issued in the local stream as well as
}
/* When we get here all misc operations issued in the local stream as well as
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
/* beginning of timed nonbonded calculation section */
if (bDoTime)
{
- t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
}
/* kernel launch config */
}
/* kernel launch config */
fillin_ocl_structures(nbp, &nbparams_params);
fillin_ocl_structures(nbp, &nbparams_params);
- auto* timingEvent = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+ auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
constexpr char kernelName[] = "k_calc_nb";
const auto kernel =
select_nbnxn_kernel(nb,
constexpr char kernelName[] = "k_calc_nb";
const auto kernel =
select_nbnxn_kernel(nb,
- t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
+ timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
NBAtomData* adat = nb->atdat;
NBParamGpu* nbp = nb->nbparam;
gpu_plist* plist = nb->plist[iloc];
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
GpuRegionTimer* timer = nullptr;
if (bDoTime)
{
GpuRegionTimer* timer = nullptr;
if (bDoTime)
{
- timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+ timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+ : timers->interaction[iloc].rollingPrune_k);
}
/* beginning of timed prune calculation section */
}
/* beginning of timed prune calculation section */
"beginning of the copy back function.");
NBAtomData* adat = nb->atdat;
"beginning of the copy back function.");
NBAtomData* adat = nb->atdat;
- cl_timers_t* t = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
bool bDoTime = nb->bDoTime;
const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
/* beginning of timed D2H section */
if (bDoTime)
{
/* beginning of timed D2H section */
if (bDoTime)
{
- t->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
}
/* With DD the local D2H transfer can only start after the non-local
}
/* With DD the local D2H transfer can only start after the non-local
atomsRange.size(),
deviceStream,
GpuApiCallBehavior::Async,
atomsRange.size(),
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
/* kick off work */
cl_error = clFlush(deviceStream.stream());
/* kick off work */
cl_error = clFlush(deviceStream.stream());
SHIFTS,
deviceStream,
GpuApiCallBehavior::Async,
SHIFTS,
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
1,
deviceStream,
GpuApiCallBehavior::Async,
1,
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
"Sizes of host- and device-side electrostatic energy terms should be the "
"same.");
static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
"Sizes of host- and device-side electrostatic energy terms should be the "
"same.");
1,
deviceStream,
GpuApiCallBehavior::Async,
1,
deviceStream,
GpuApiCallBehavior::Async,
- bDoTime ? t->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
- t->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+ timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
nb->bUseTwoStreams = bLocalAndNonlocal;
nb->bUseTwoStreams = bLocalAndNonlocal;
- nb->timers = new cl_timers_t();
+ nb->timers = new Nbnxm::GpuTimers();
snew(nb->timings, 1);
/* set device info, just point it to the right GPU among the detected ones */
snew(nb->timings, 1);
/* set device info, just point it to the right GPU among the detected ones */
int nalloc, natoms;
bool realloced;
bool bDoTime = nb->bDoTime;
int nalloc, natoms;
bool realloced;
bool bDoTime = nb->bDoTime;
- cl_timers_t* timers = nb->timers;
+ Nbnxm::GpuTimers* timers = nb->timers;
NBAtomData* d_atdat = nb->atdat;
const DeviceContext& deviceContext = *nb->deviceContext_;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
NBAtomData* d_atdat = nb->atdat;
const DeviceContext& deviceContext = *nb->deviceContext_;
const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
-/** \internal
- * \brief Typedef of actual timer type.
- */
-typedef struct Nbnxm::gpu_timers_t cl_timers_t;
-
/*! \internal
* \brief Main data structure for OpenCL nonbonded force calculations.
*/
/*! \internal
* \brief Main data structure for OpenCL nonbonded force calculations.
*/
//! True if event-based timing is enabled.
bool bDoTime = false;
//! OpenCL event-based timers.
//! True if event-based timing is enabled.
bool bDoTime = false;
//! OpenCL event-based timers.
- cl_timers_t* timers = nullptr;
+ Nbnxm::GpuTimers* timers = nullptr;
//! Timing data. TODO: deprecate this and query timers for accumulated data instead
gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
};
//! Timing data. TODO: deprecate this and query timers for accumulated data instead
gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
};
/*! \brief True if event-based timing is enabled. Always false for SYCL. */
bool bDoTime = false;
/*! \brief Dummy timers. */
/*! \brief True if event-based timing is enabled. Always false for SYCL. */
bool bDoTime = false;
/*! \brief Dummy timers. */
- Nbnxm::gpu_timers_t* timers = nullptr;
+ Nbnxm::GpuTimers* timers = nullptr;
/*! \brief Dummy timing data. */
gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
/*! \brief Dummy timing data. */
gmx_wallclock_gpu_nbnxn_t* timings = nullptr;