#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/math/vectypes.h"
#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/timing/wallcycle.h"
#include "gromacs/utility/gmxmpi.h"
#include "domdec_internal.h"
coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
}
+ wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+
// launch kernel to pack send buffer
KernelLaunchConfig config;
config.blockSize[0] = c_threadsPerBlock;
"Domdec GPU Apply X Halo Exchange", kernelArgs);
}
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+
+ // Consider time spent in communicateHaloData as Comm.X counter
+ // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
+ wallcycle_start(wcycle_, ewcMOVEX);
+
communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
+ wallcycle_stop(wcycle_, ewcMOVEX);
+
return;
}
// and before the local buffer operations. It operates in the non-local stream.
void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
{
+ // Consider time spent in communicateHaloData as Comm.F counter
+ // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
+ wallcycle_start(wcycle_, ewcMOVEF);
// Communicate halo data (in non-local stream)
communicateHaloData(d_f_, HaloQuantity::HaloForces, nullptr);
+ wallcycle_stop(wcycle_, ewcMOVEF);
+
+ wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+ wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+
float3* d_f = d_f_;
if (pulse_ == (dd_->comm->cd[0].numPulses() - 1))
{
fReadyOnDevice_.markEvent(nonLocalStream_);
}
+
+ wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+ wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
}
{
stat = cudaMemcpyAsync(remotePtr, sendPtr, sendSize * DIM * sizeof(float),
cudaMemcpyDeviceToDevice, nonLocalStream_.stream());
+
CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
}
const DeviceContext& deviceContext,
const DeviceStream& localStream,
const DeviceStream& nonLocalStream,
- int pulse) :
+ int pulse,
+ gmx_wallcycle* wcycle) :
dd_(dd),
sendRankX_(dd->neighbor[0][1]),
recvRankX_(dd->neighbor[0][0]),
deviceContext_(deviceContext),
localStream_(localStream),
nonLocalStream_(nonLocalStream),
- pulse_(pulse)
+ pulse_(pulse),
+ wcycle_(wcycle)
{
GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
const DeviceContext& deviceContext,
const DeviceStream& localStream,
const DeviceStream& nonLocalStream,
- int pulse) :
- impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse))
+ int pulse,
+ gmx_wallcycle* wcycle) :
+ impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
{
}