Launch overheads are counted in the main GPU launch overhead counter, with
a separate subcounter used for the launch itself, and a main counter is
used for the CPU blocking-wait timing.
Note that this change introduces an mdtypes->timing->mdtypes cyclic dependency;
the warning about it is suppressed.
Fixes #3207
Change-Id: I3b69df9e4888800b43712a42b863958db80f5caa
# modular simulator uses shellfc from mdrun, but is later included in mdrun by simulator builder
modularsimulator -> mdrun
# modular simulator uses shellfc from mdrun, but is later included in mdrun by simulator builder
modularsimulator -> mdrun
+
+# Cycle counters in timing use comrec for the set up, which is in the mdtypes. This introduces
+# cyclic dependencies if the cycle counting is used anywhere in mdtypes.
+timing -> mdtypes
\ No newline at end of file
// TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
// This should be made safer.
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
// TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
// This should be made safer.
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- commandStream, deviceContext, GpuApiCallBehavior::Async, paddingSize);
+ commandStream, deviceContext, GpuApiCallBehavior::Async, paddingSize, wcycle);
// restrict one from using other constructor here.
return std::make_unique<StatePropagatorDataGpu>(
pme_gpu_get_device_stream(&pme), pme_gpu_get_device_context(&pme),
// restrict one from using other constructor here.
return std::make_unique<StatePropagatorDataGpu>(
pme_gpu_get_device_stream(&pme), pme_gpu_get_device_context(&pme),
- GpuApiCallBehavior::Sync, pme_gpu_get_padding_size(&pme));
+ GpuApiCallBehavior::Sync, pme_gpu_get_padding_size(&pme), nullptr);
}
//! PME initialization with atom data
}
//! PME initialization with atom data
: GpuApiCallBehavior::Sync;
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
: GpuApiCallBehavior::Sync;
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize);
+ pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle);
fr->stateGpu = stateGpu.get();
}
fr->stateGpu = stateGpu.get();
}
#include "locality.h"
class GpuEventSynchronizer;
#include "locality.h"
class GpuEventSynchronizer;
* \param[in] deviceContext Device context, nullptr allowed.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] deviceContext Device context, nullptr allowed.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] wcycle Wall cycle counter data.
*/
StatePropagatorDataGpu(const void* pmeStream,
const void* localStream,
const void* nonLocalStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
*/
StatePropagatorDataGpu(const void* pmeStream,
const void* localStream,
const void* nonLocalStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
*
/*! \brief Constructor to use in PME-only rank and in tests.
*
* \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] wcycle Wall cycle counter data.
*/
StatePropagatorDataGpu(const void* pmeStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
*/
StatePropagatorDataGpu(const void* pmeStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle);
//! Move constructor
StatePropagatorDataGpu(StatePropagatorDataGpu&& other) noexcept;
//! Move constructor
StatePropagatorDataGpu(StatePropagatorDataGpu&& other) noexcept;
const void* /* nonLocalStream */,
const void* /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
const void* /* nonLocalStream */,
const void* /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
- int /* paddingSize */) :
+ int /* paddingSize */,
+ gmx_wallcycle* /* wcycle */) :
StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */,
const void* /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream */,
const void* /* deviceContext */,
GpuApiCallBehavior /* transferKind */,
- int /* paddingSize */) :
+ int /* paddingSize */,
+ gmx_wallcycle* /* wcycle */) :
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/enumerationhelpers.h"
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/enumerationhelpers.h"
* \param[in] deviceContext Device context, nullptr allowed.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] deviceContext Device context, nullptr allowed.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] wcycle Wall cycle counter data.
*/
Impl(const void* pmeStream,
const void* localStream,
const void* nonLocalStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
*/
Impl(const void* pmeStream,
const void* localStream,
const void* nonLocalStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle);
/*! \brief Constructor to use in PME-only rank and in tests.
*
/*! \brief Constructor to use in PME-only rank and in tests.
*
* \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] paddingSize Padding size for coordinates buffer.
* \param[in] deviceContext Device context, nullptr allowed for non-OpenCL builds.
* \param[in] transferKind H2D/D2H transfer call behavior (synchronous or not).
* \param[in] paddingSize Padding size for coordinates buffer.
+ * \param[in] wcycle Wall cycle counter data.
- Impl(const void* pmeStream, const void* deviceContext, GpuApiCallBehavior transferKind, int paddingSize);
+ Impl(const void* pmeStream,
+ const void* deviceContext,
+ GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle);
//! Allocation size for the force buffer
int d_fCapacity_ = -1;
//! Allocation size for the force buffer
int d_fCapacity_ = -1;
+ //! \brief Pointer to wallcycle structure.
+ gmx_wallcycle* wcycle_;
+
/*! \brief Performs the copy of data from host to device buffer.
*
* \todo Template on locality.
/*! \brief Performs the copy of data from host to device buffer.
*
* \todo Template on locality.
# endif
# include "gromacs/math/vectypes.h"
# include "gromacs/mdtypes/state_propagator_data_gpu.h"
# endif
# include "gromacs/math/vectypes.h"
# include "gromacs/mdtypes/state_propagator_data_gpu.h"
+# include "gromacs/timing/wallcycle.h"
# include "gromacs/utility/classhelpers.h"
# include "state_propagator_data_gpu_impl.h"
# include "gromacs/utility/classhelpers.h"
# include "state_propagator_data_gpu_impl.h"
const void* nonLocalStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
const void* nonLocalStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle) :
transferKind_(transferKind),
transferKind_(transferKind),
- paddingSize_(paddingSize)
+ paddingSize_(paddingSize),
+ wcycle_(wcycle)
{
static_assert(GMX_GPU != GMX_GPU_NONE,
"This object should only be constructed on the GPU code-paths.");
{
static_assert(GMX_GPU != GMX_GPU_NONE,
"This object should only be constructed on the GPU code-paths.");
StatePropagatorDataGpu::Impl::Impl(const void* pmeStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
StatePropagatorDataGpu::Impl::Impl(const void* pmeStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
+ int paddingSize,
+ gmx_wallcycle* wcycle) :
transferKind_(transferKind),
transferKind_(transferKind),
- paddingSize_(paddingSize)
+ paddingSize_(paddingSize),
+ wcycle_(wcycle)
{
static_assert(GMX_GPU != GMX_GPU_NONE,
"This object should only be constructed on the GPU code-paths.");
{
static_assert(GMX_GPU != GMX_GPU_NONE,
"This object should only be constructed on the GPU code-paths.");
void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
{
void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
{
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
numAtomsLocal_ = numAtomsLocal;
numAtomsAll_ = numAtomsAll;
numAtomsLocal_ = numAtomsLocal;
numAtomsAll_ = numAtomsAll;
{
clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
}
{
clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, localStream_);
}
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
}
std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
AtomLocality atomLocality,
CommandStream commandStream)
{
AtomLocality atomLocality,
CommandStream commandStream)
{
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
int atomsStartAt, numAtomsToCopy;
std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
int atomsStartAt, numAtomsToCopy;
std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
copyToDeviceBuffer(&d_data, reinterpret_cast<const float*>(&h_data.data()[atomsStartAt]),
elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
}
copyToDeviceBuffer(&d_data, reinterpret_cast<const float*>(&h_data.data()[atomsStartAt]),
elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
}
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
}
void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_data,
AtomLocality atomLocality,
CommandStream commandStream)
{
AtomLocality atomLocality,
CommandStream commandStream)
{
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
GMX_UNUSED_VALUE(dataSize);
GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
int atomsStartAt, numAtomsToCopy;
std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
int atomsStartAt, numAtomsToCopy;
std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
}
copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
elementsStartAt, numElementsToCopy, commandStream, transferKind_, nullptr);
}
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
}
DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying positions with given atom locality.");
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying positions with given atom locality.");
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
// markEvent is skipped in OpenCL as:
copyToDevice(d_x_, h_x, d_xSize_, atomLocality, commandStream);
// markEvent is skipped in OpenCL as:
{
xReadyOnDevice_[atomLocality].markEvent(commandStream);
}
{
xReadyOnDevice_[atomLocality].markEvent(commandStream);
}
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
{
void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
{
+ wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
xReadyOnDevice_[atomLocality].waitForEvent();
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
xReadyOnDevice_[atomLocality].waitForEvent();
+ wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying positions with given atom locality.");
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying positions with given atom locality.");
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
// Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
xReadyOnHost_[atomLocality].markEvent(commandStream);
copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, commandStream);
// Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
xReadyOnHost_[atomLocality].markEvent(commandStream);
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
{
}
void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
{
+ wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
xReadyOnHost_[atomLocality].waitForEvent();
xReadyOnHost_[atomLocality].waitForEvent();
+ wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying velocities with given atom locality.");
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying velocities with given atom locality.");
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
vReadyOnDevice_[atomLocality].markEvent(commandStream);
copyToDevice(d_v_, h_v, d_vSize_, atomLocality, commandStream);
vReadyOnDevice_[atomLocality].markEvent(commandStream);
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying velocities with given atom locality.");
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying velocities with given atom locality.");
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
vReadyOnHost_[atomLocality].markEvent(commandStream);
copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, commandStream);
vReadyOnHost_[atomLocality].markEvent(commandStream);
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
{
}
void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
{
+ wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
vReadyOnHost_[atomLocality].waitForEvent();
vReadyOnHost_[atomLocality].waitForEvent();
+ wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying forces with given atom locality.");
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying forces with given atom locality.");
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
fReadyOnDevice_[atomLocality].markEvent(commandStream);
copyToDevice(d_f_, h_f, d_fSize_, atomLocality, commandStream);
fReadyOnDevice_[atomLocality].markEvent(commandStream);
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
}
GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying forces with given atom locality.");
GMX_ASSERT(commandStream != nullptr,
"No stream is valid for copying forces with given atom locality.");
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+
copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
fReadyOnHost_[atomLocality].markEvent(commandStream);
copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, commandStream);
fReadyOnHost_[atomLocality].markEvent(commandStream);
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
{
}
void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
{
+ wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
fReadyOnHost_[atomLocality].waitForEvent();
fReadyOnHost_[atomLocality].waitForEvent();
+ wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
}
void* StatePropagatorDataGpu::Impl::getUpdateStream()
}
void* StatePropagatorDataGpu::Impl::getUpdateStream()
const void* nonLocalStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
const void* nonLocalStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize) :
- impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize))
+ int paddingSize,
+ gmx_wallcycle* wcycle) :
+ impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle))
{
}
StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
{
}
StatePropagatorDataGpu::StatePropagatorDataGpu(const void* pmeStream,
const void* deviceContext,
GpuApiCallBehavior transferKind,
- int paddingSize) :
- impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize))
+ int paddingSize,
+ gmx_wallcycle* wcycle) :
+ impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle))
"Reduce GPU PME F",
"Wait GPU NB nonloc.",
"Wait GPU NB local",
"Reduce GPU PME F",
"Wait GPU NB nonloc.",
"Wait GPU NB local",
"NB X/F buffer ops.",
"Vsite spread",
"COM pull force",
"NB X/F buffer ops.",
"Vsite spread",
"COM pull force",
"Launch NB GPU tasks",
"Launch Bonded GPU tasks",
"Launch PME GPU tasks",
"Launch NB GPU tasks",
"Launch Bonded GPU tasks",
"Launch PME GPU tasks",
"Ewald F correction",
"NB X buffer ops.",
"NB F buffer ops.",
"Ewald F correction",
"NB X buffer ops.",
"NB F buffer ops.",
ewcPME_GPU_F_REDUCTION,
ewcWAIT_GPU_NB_NL,
ewcWAIT_GPU_NB_L,
ewcPME_GPU_F_REDUCTION,
ewcWAIT_GPU_NB_NL,
ewcWAIT_GPU_NB_L,
+ ewcWAIT_GPU_STATE_PROPAGATOR_DATA,
ewcNB_XF_BUF_OPS,
ewcVSITESPREAD,
ewcPULLPOT,
ewcNB_XF_BUF_OPS,
ewcVSITESPREAD,
ewcPULLPOT,
ewcsLAUNCH_GPU_NONBONDED,
ewcsLAUNCH_GPU_BONDED,
ewcsLAUNCH_GPU_PME,
ewcsLAUNCH_GPU_NONBONDED,
ewcsLAUNCH_GPU_BONDED,
ewcsLAUNCH_GPU_PME,
+ ewcsLAUNCH_STATE_PROPAGATOR_DATA,
ewcsEWALD_CORRECTION,
ewcsNB_X_BUF_OPS,
ewcsNB_F_BUF_OPS,
ewcsEWALD_CORRECTION,
ewcsNB_X_BUF_OPS,
ewcsNB_F_BUF_OPS,