Replace buffer ops counters with GPU launch counters for GPU backend.
void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
const t_commrec& cr,
- const gmx::DeviceStreamManager& deviceStreamManager)
+ const gmx::DeviceStreamManager& deviceStreamManager,
+ gmx_wallcycle* wcycle)
{
GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
"Local non-bonded stream should be valid when using"
cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(),
deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
- deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse));
+ deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse, wcycle));
}
}
}
* \param[in] mdlog The logger object.
* \param[in] cr The commrec object.
* \param[in] deviceStreamManager Manager of the GPU context and streams.
+ * \param[in] wcycle The wallclock counter.
*/
void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
const t_commrec& cr,
- const gmx::DeviceStreamManager& deviceStreamManager);
+ const gmx::DeviceStreamManager& deviceStreamManager,
+ gmx_wallcycle* wcycle);
/*! \brief
* (Re-) Initialization for GPU halo exchange
#include "gromacs/utility/gmxmpi.h"
struct gmx_domdec_t;
+struct gmx_wallcycle;
class DeviceContext;
class DeviceStream;
class GpuEventSynchronizer;
* \param [in] streamLocal local NB CUDA stream.
* \param [in] streamNonLocal non-local NB CUDA stream.
* \param [in] pulse the communication pulse for this instance
+ * \param [in] wcycle The wallclock counter
*/
GpuHaloExchange(gmx_domdec_t* dd,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
const DeviceStream& streamLocal,
const DeviceStream& streamNonLocal,
- int pulse);
+ int pulse,
+ gmx_wallcycle* wcycle);
~GpuHaloExchange();
/*! \brief
const DeviceContext& /* deviceContext */,
const DeviceStream& /*streamLocal */,
const DeviceStream& /*streamNonLocal */,
- int /*pulse */) :
+ int /*pulse */,
+ gmx_wallcycle* /*wcycle*/) :
impl_(nullptr)
{
GMX_ASSERT(false,
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/math/vectypes.h"
#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxmpi.h"
#include "domdec_internal.h"
coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
}
+ wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+
// launch kernel to pack send buffer
KernelLaunchConfig config;
config.blockSize[0] = c_threadsPerBlock;
"Domdec GPU Apply X Halo Exchange", kernelArgs);
}
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+
+ // Consider time spent in communicateHaloData as Comm.X counter
+ // TODO: We need further refinement here as communicateHaloData includes launch time for cudaMemcpyAsync
+ wallcycle_start(wcycle_, ewcMOVEX);
+
communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
+ wallcycle_stop(wcycle_, ewcMOVEX);
+
return;
}
// and before the local buffer operations. It operates in the non-local stream.
void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
{
+ // Consider time spent in communicateHaloData as Comm.F counter
+ // TODO: We need further refinement here as communicateHaloData includes launch time for cudaMemcpyAsync
+ wallcycle_start(wcycle_, ewcMOVEF);
// Communicate halo data (in non-local stream)
communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);
+ wallcycle_stop(wcycle_, ewcMOVEF);
+
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+
float3* d_f = d_f_;
if (pulse_ == (dd_->comm->cd[0].numPulses() - 1))
{
fReadyOnDevice_.markEvent(nonLocalStream_);
}
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
{
stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float),
cudaMemcpyDeviceToDevice, nonLocalStream_.stream());
+
CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
}
const DeviceContext& deviceContext,
const DeviceStream& localStream,
const DeviceStream& nonLocalStream,
- int pulse) :
+ int pulse,
+ gmx_wallcycle* wcycle) :
dd_(dd),
sendRankX_(dd->neighbor[0][1]),
recvRankX_(dd->neighbor[0][0]),
deviceContext_(deviceContext),
localStream_(localStream),
nonLocalStream_(nonLocalStream),
- pulse_(pulse)
+ pulse_(pulse),
+ wcycle_(wcycle)
{
GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
const DeviceContext& deviceContext,
const DeviceStream& localStream,
const DeviceStream& nonLocalStream,
- int pulse) :
- impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse))
+ int pulse,
+ gmx_wallcycle* wcycle) :
+ impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
{
}
#include "gromacs/gpu_utils/hostallocator.h"
#include "gromacs/utility/gmxmpi.h"
+struct gmx_wallcycle;
+
namespace gmx
{
* \param [in] localStream local NB CUDA stream
* \param [in] nonLocalStream non-local NB CUDA stream
* \param [in] pulse the communication pulse for this instance
+ * \param [in] wcycle The wallclock counter
*/
Impl(gmx_domdec_t* dd,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
const DeviceStream& localStream,
const DeviceStream& nonLocalStream,
- int pulse);
+ int pulse,
+ gmx_wallcycle* wcycle);
~Impl();
/*! \brief
int pulse_ = 0;
//! Number of zones. Always 1 for 1-D case.
const int nzone_ = 1;
+ //! The wallclock counter
+ gmx_wallcycle* wcycle_ = nullptr;
};
} // namespace gmx
"GPU device manager has to be initialized to use GPU "
"version of halo exchange.");
// TODO remove need to pass local stream into GPU halo exchange - Issue #3093
- constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager);
+ constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
}
}
}
DeviceBuffer<gmx::RVec> d_x,
GpuEventSynchronizer* xReadyOnDevice)
{
- wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
+ wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal, gpu_nbv, d_x, xReadyOnDevice);
- wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
- wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
return;
}
- wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
- wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
+ wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice,
dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce);
- wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
- wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
void nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone)
"NB X buffer ops.",
"NB F buffer ops.",
"Clear force buffer",
+ "Launch GPU NB X buffer ops.",
+ "Launch GPU NB F buffer ops.",
+ "Launch GPU Comm. coord.",
+ "Launch GPU Comm. force.",
"Test subcounter",
};
ewcsNB_X_BUF_OPS,
ewcsNB_F_BUF_OPS,
ewcsCLEAR_FORCE_BUFFER,
+ ewcsLAUNCH_GPU_NB_X_BUF_OPS,
+ ewcsLAUNCH_GPU_NB_F_BUF_OPS,
+ ewcsLAUNCH_GPU_MOVEX,
+ ewcsLAUNCH_GPU_MOVEF,
ewcsTEST,
ewcsNR
};