#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/mdtypes/group.h"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/classhelpers.h"
/*! \brief Create Update-Constrain object.
*
* The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain
- * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
- * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
- * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
- * markEvent(...) method is called unconditionally.
+ * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that
+ * modify coordinates. The event is maintained outside this class and also passed to all (if
+ * any) consumers of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr
+ * because the markEvent(...) method is called unconditionally.
*
* \param[in] ir Input record data: LINCS takes number of iterations and order of
* projection from it.
* and target O-H and H-H distances from this object.
* \param[in] deviceContext GPU device context.
* \param[in] deviceStream GPU stream to use.
- * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU.
+ * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done
+ * on the GPU.
+ * \param[in] wcycle The wallclock counter, used to account time to the GPU launch counters.
*/
UpdateConstrainGpu(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
const DeviceStream& deviceStream,
- GpuEventSynchronizer* xUpdatedOnDevice);
+ GpuEventSynchronizer* xUpdatedOnDevice,
+ gmx_wallcycle* wcycle);
~UpdateConstrainGpu();
const gmx_mtop_t& /* mtop */,
const DeviceContext& /* deviceContext */,
const DeviceStream& /* deviceStream */,
- GpuEventSynchronizer* /* xUpdatedOnDevice */) :
+ GpuEventSynchronizer* /* xUpdatedOnDevice */,
+ gmx_wallcycle* /*wcycle*/) :
impl_(nullptr)
{
GMX_ASSERT(!impl_,
#include "gromacs/mdlib/settle_gpu.cuh"
#include "gromacs/mdlib/update_constrain_gpu.h"
#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/timing/wallcycle.h"
namespace gmx
{
const float dtPressureCouple,
const matrix prVelocityScalingMatrix)
{
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+
// Clearing virial matrix
// TODO There is no point in having separate virial matrix for constraints
clear_mat(virial);
coordinatesReady_->markEvent(deviceStream_);
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+
return;
}
// Fills a ScalingMatrix from \p scalingMatrix and synchronizes the device stream, with the
// whole method accounted to the ewcLAUNCH_GPU / ewcsLAUNCH_GPU_UPDATE_CONSTRAIN counters.
// NOTE(review): presumably this launches the coordinate scaling kernel; the launch itself is
// not visible in this excerpt -- confirm against the full file.
void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
{
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+
// Copy the elements of the input matrix into the ScalingMatrix structure
ScalingMatrix mu;
mu.xx = scalingMatrix[XX][XX];
mu.yy = scalingMatrix[YY][YY];
// TODO: Although this only happens on the pressure coupling steps, this synchronization
// can affect the performance if nstpcouple is small.
// NOTE(review): the synchronization below sits inside the timed region, so the launch
// counters also include the time spent waiting on the stream.
deviceStream_.synchronize();
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
// Fills a ScalingMatrix from \p scalingMatrix and synchronizes the device stream, with the
// whole method accounted to the ewcLAUNCH_GPU / ewcsLAUNCH_GPU_UPDATE_CONSTRAIN counters.
// NOTE(review): presumably this launches the velocity scaling kernel; the launch itself is
// not visible in this excerpt -- confirm against the full file.
void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix)
{
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+
// Copy the elements of the input matrix into the ScalingMatrix structure
ScalingMatrix mu;
mu.xx = scalingMatrix[XX][XX];
mu.yy = scalingMatrix[YY][YY];
// TODO: Although this only happens on the pressure coupling steps, this synchronization
// can affect the performance if nstpcouple is small.
// NOTE(review): the synchronization below sits inside the timed region, so the launch
// counters also include the time spent waiting on the stream.
deviceStream_.synchronize();
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
const DeviceStream& deviceStream,
- GpuEventSynchronizer* xUpdatedOnDevice) :
+ GpuEventSynchronizer* xUpdatedOnDevice,
+ gmx_wallcycle* wcycle) :
deviceContext_(deviceContext),
deviceStream_(deviceStream),
- coordinatesReady_(xUpdatedOnDevice)
+ coordinatesReady_(xUpdatedOnDevice),
+ wcycle_(wcycle)
{
GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
const t_mdatoms& md,
const int numTempScaleValues)
{
+ // TODO wallcycle: confirm that the GPU launch counters started below are the right ones for this setup work
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+
GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null.");
coordinateScalingKernelLaunchConfig_.gridSize[0] =
(numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
// Updates the stored pbcAiuc_ structure from the given PBC type and box by forwarding to
// setPbcAiuc() with the number of PBC dimensions for \p pbcType.
void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box)
{
+ // TODO wallcycle: this method is not yet covered by any wallcycle counter
setPbcAiuc(numPbcDimensions(pbcType), box, &pbcAiuc_);
}
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
const DeviceStream& deviceStream,
- GpuEventSynchronizer* xUpdatedOnDevice) :
- impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice))
+ GpuEventSynchronizer* xUpdatedOnDevice,
+ gmx_wallcycle* wcycle) :
+ impl_(new Impl(ir, mtop, deviceContext, deviceStream, xUpdatedOnDevice, wcycle))
{
}
/*! \brief Create Update-Constrain object.
*
* The constructor is given a non-nullptr \p deviceStream, in which all the update and constrain
- * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that modify
- * coordinates. The event is maintained outside this class and also passed to all (if any) consumers
- * of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr because the
- * markEvent(...) method is called unconditionally.
+ * routines are executed. \p xUpdatedOnDevice should mark the completion of all kernels that
+ * modify coordinates. The event is maintained outside this class and also passed to all (if
+ * any) consumers of the updated coordinates. The \p xUpdatedOnDevice also can not be a nullptr
+ * because the markEvent(...) method is called unconditionally.
*
* \param[in] ir Input record data: LINCS takes number of iterations and order of
* projection from it.
* and target O-H and H-H distances from this object.
* \param[in] deviceContext GPU device context.
* \param[in] deviceStream GPU stream to use.
- * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that update is done on the GPU.
+ * \param[in] xUpdatedOnDevice The event synchronizer to use to mark that
+ * update is done on the GPU.
+ * \param[in] wcycle The wallclock counter, used to account time to the GPU launch counters.
*/
Impl(const t_inputrec& ir,
const gmx_mtop_t& mtop,
const DeviceContext& deviceContext,
const DeviceStream& deviceStream,
- GpuEventSynchronizer* xUpdatedOnDevice);
+ GpuEventSynchronizer* xUpdatedOnDevice,
+ gmx_wallcycle* wcycle);
~Impl();
//! A pointer to the event to indicate when the update of coordinates is complete
GpuEventSynchronizer* coordinatesReady_;
+ //! The wallclock counter
+ gmx_wallcycle* wcycle_ = nullptr;
};
} // namespace gmx
integrator = std::make_unique<UpdateConstrainGpu>(
*ir, *top_global, fr->deviceStreamManager->context(),
fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
- stateGpu->xUpdatedOnDevice());
+ stateGpu->xUpdatedOnDevice(), wcycle);
integrator->setPbc(PbcType::Xyz, state->box);
}
if (useGpuForUpdate)
{
+ wallcycle_stop(wcycle, ewcUPDATE);
+
if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
{
integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
"Launch GPU NB F buffer ops.",
"Launch GPU Comm. coord.",
"Launch GPU Comm. force.",
+ "Launch GPU update",
"Test subcounter",
};
ewcsLAUNCH_GPU_NB_F_BUF_OPS,
ewcsLAUNCH_GPU_MOVEX,
ewcsLAUNCH_GPU_MOVEF,
+ ewcsLAUNCH_GPU_UPDATE_CONSTRAIN,
ewcsTEST,
ewcsNR
};