#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/math/vectypes.h"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/fixedcapacityvector.h"
*
* \param [in] deviceContext GPU device context
* \param [in] deviceStream Stream to use for reduction
+ * \param [in] wcycle Wall-clock cycle counter
*/
- GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ GpuForceReduction(const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream,
+ gmx_wallcycle* wcycle);
~GpuForceReduction();
/*! \brief Register a nbnxm-format force to be reduced
return;
}
-GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
+GpuForceReduction::Impl::Impl(const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream,
+ gmx_wallcycle* wcycle) :
deviceContext_(deviceContext),
- deviceStream_(deviceStream){};
+ deviceStream_(deviceStream),
+ wcycle_(wcycle){};
void GpuForceReduction::Impl::reinit(float3* baseForcePtr,
const int numAtoms,
accumulate_ = accumulate;
completionMarker_ = completionMarker;
cellInfo_.cell = cell.data();
+
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
reallocateDeviceBuffer(&cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize,
&cellInfo_.cellSizeAlloc, deviceContext_);
copyToDeviceBuffer(&cellInfo_.d_cell, &(cellInfo_.cell[atomStart]), 0, numAtoms_, deviceStream_,
GpuApiCallBehavior::Async, nullptr);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
dependencyList_.clear();
};
void GpuForceReduction::Impl::execute()
{
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
if (numAtoms_ == 0)
{
{
completionMarker_->markEvent(deviceStream_);
}
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
// Out-of-line destructor with no custom teardown logic; members are destroyed implicitly.
GpuForceReduction::Impl::~Impl() = default;
-GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
- impl_(new Impl(deviceContext, deviceStream))
+GpuForceReduction::GpuForceReduction(const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream,
+ gmx_wallcycle* wcycle) :
+ impl_(new Impl(deviceContext, deviceStream, wcycle))
{
}
*
* \param [in] deviceStream Stream to use for reduction
* \param [in] deviceContext GPU device context
+ * \param [in] wcycle The wallclock counter
*/
- Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ Impl(const DeviceContext& deviceContext, const DeviceStream& deviceStream, gmx_wallcycle* wcycle);
~Impl();
/*! \brief Register a nbnxm-format force to be reduced
DeviceBuffer<RVec> rvecForceToAdd_ = nullptr;
//! event to be marked when reduction launch has been completed
GpuEventSynchronizer* completionMarker_ = nullptr;
+ //! The wallclock counter
+ gmx_wallcycle* wcycle_ = nullptr;
};
} // namespace gmx
{
fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
- deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal));
+ deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal), wcycle);
fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
- deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal));
+ deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal), wcycle);
}
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;