Launch timing was missing from the GPU force reduction refactoring added in
257d80. This change adds back the cycle counting, reducing the time
that leaks into the rest timer.
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/math/vectypes.h"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/fixedcapacityvector.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/fixedcapacityvector.h"
*
* \param [in] deviceContext GPU device context
* \param [in] deviceStream Stream to use for reduction
*
* \param [in] deviceContext GPU device context
* \param [in] deviceStream Stream to use for reduction
+ * \param [in] wcycle Wall-clock cycle counter
- GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ GpuForceReduction(const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream,
+ gmx_wallcycle* wcycle);
~GpuForceReduction();
/*! \brief Register a nbnxm-format force to be reduced
~GpuForceReduction();
/*! \brief Register a nbnxm-format force to be reduced
};
GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */,
};
GpuForceReduction::GpuForceReduction(const DeviceContext& /* deviceContext */,
- const DeviceStream& /* deviceStream */) :
+ const DeviceStream& /* deviceStream */,
+ gmx_wallcycle* /*wcycle*/) :
impl_(nullptr)
{
GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
impl_(nullptr)
{
GMX_ASSERT(false, "A CPU stub has been called instead of the correct implementation.");
-GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
+GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream,
+ gmx_wallcycle* wcycle) :
deviceContext_(deviceContext),
deviceContext_(deviceContext),
- deviceStream_(deviceStream){};
+ deviceStream_(deviceStream),
+ wcycle_(wcycle){};
void GpuForceReduction::Impl::reinit(float3* baseForcePtr,
const int numAtoms,
void GpuForceReduction::Impl::reinit(float3* baseForcePtr,
const int numAtoms,
accumulate_ = accumulate;
completionMarker_ = completionMarker;
cellInfo_.cell = cell.data();
accumulate_ = accumulate;
completionMarker_ = completionMarker;
cellInfo_.cell = cell.data();
+
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize,
&cellInfo_.cellSizeAlloc, deviceContext_);
copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_,
GpuApiCallBehavior::Async, nullptr);
reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize,
&cellInfo_.cellSizeAlloc, deviceContext_);
copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_,
GpuApiCallBehavior::Async, nullptr);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
dependencyList_.clear();
};
dependencyList_.clear();
};
void GpuForceReduction::Impl::execute()
{
void GpuForceReduction::Impl::execute()
{
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
{
completionMarker_->markEvent(deviceStream_);
}
{
completionMarker_->markEvent(deviceStream_);
}
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
GpuForceReduction::Impl::~Impl(){};
}
GpuForceReduction::Impl::~Impl(){};
-GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
- impl_(new Impl(deviceContext, deviceStream))
+GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream,
+ gmx_wallcycle* wcycle) :
+ impl_(new Impl(deviceContext, deviceStream, wcycle))
*
* \param [in] deviceStream Stream to use for reduction
* \param [in] deviceContext GPU device context
*
* \param [in] deviceStream Stream to use for reduction
* \param [in] deviceContext GPU device context
+ * \param [in] wcycle The wallclock counter
- Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
~Impl();
/*! \brief Register a nbnxm-format force to be reduced
~Impl();
/*! \brief Register a nbnxm-format force to be reduced
DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
//! Event to be marked when the reduction launch has been completed
GpuEventSynchronizer* completionMarker_ = nullptr;
DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
//! Event to be marked when the reduction launch has been completed
GpuEventSynchronizer* completionMarker_ = nullptr;
+ //! The wallclock counter
+ gmx_wallcycle* wcycle_ = nullptr;
{
fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
{
fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
- deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal));
+ deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), wcycle);
fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
- deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal));
+ deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), wcycle);
}
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
}
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;