#include "gromacs/utility/smalloc.h"
#include "pme_gpu_internal.h"
+#include "pme_internal.h"
#include "pme_output.h"
#include "pme_pp_communication.h"
-/*! \brief environment variable to enable GPU P2P communication */
-static const bool c_enableGpuPmePpComms =
- GMX_GPU_CUDA && GMX_THREAD_MPI && (getenv("GMX_GPU_PME_PP_COMMS") != nullptr);
-
/*! \brief Master PP-PME communication data structure */
struct gmx_pme_pp
{
/*! \brief whether GPU direct communications are active for PME-PP transfers */
bool useGpuDirectComm = false;
+ /*! \brief whether GPU direct communications should send forces directly to remote GPU memory */
+ bool sendForcesDirectToPpGpu = false;
};
/*! \brief Initialize the PME-only side of the PME <-> PP communication */
return pme_pp;
}
-static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
+static void reset_pmeonly_counters(gmx_wallcycle* wcycle,
gmx_walltime_accounting_t walltime_accounting,
t_nrnb* nrnb,
int64_t step,
bool useGpuForPme)
{
/* Reset all the counters related to performance over the run */
- wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_stop(wcycle, WallCycleCounter::Run);
wallcycle_reset_all(wcycle);
*nrnb = { 0 };
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
walltime_accounting_reset_time(walltime_accounting, step);
if (useGpuForPme)
}
/*! \brief Called by PME-only ranks to receive coefficients and coordinates
+ *
+ * Note that with GPU direct communication the transfer is only initiated, it is the responsibility
+ * of the caller to synchronize prior to launching spread.
*
* \param[in] pme PME data structure.
* \param[in,out] pme_pp PME-PP communication structure.
real* ewaldcoeff_lj,
bool useGpuForPme,
gmx::StatePropagatorDataGpu* stateGpu,
- PmeRunMode gmx_unused runMode)
+ PmeRunMode gmx_unused runMode)
{
int status = -1;
int nat = 0;
#if GMX_MPI
- unsigned int flags = 0;
- int messages = 0;
- bool atomSetChanged = false;
+ int messages = 0;
+ bool atomSetChanged = false;
do
{
/* Receive the send count, box and time step from the peer PP node */
MPI_Recv(&cnb, sizeof(cnb), MPI_BYTE, pme_pp->peerRankId, eCommType_CNB, pme_pp->mpi_comm_mysim, MPI_STATUS_IGNORE);
- /* We accumulate all received flags */
- flags |= cnb.flags;
-
*step = cnb.step;
if (debug)
GMX_ASSERT(!pme_pp->useGpuDirectComm || (pme_pp->pmeForceSenderGpu != nullptr),
"The use of GPU direct communication for PME-PP is enabled, "
"but the PME GPU force reciever object does not exist");
+ pme_pp->sendForcesDirectToPpGpu = ((cnb.flags & PP_PME_RECVFTOGPU) != 0);
if (cnb.flags & PP_PME_FINISH)
{
{
if (atomSetChanged)
{
- gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA.data(), pme_pp->chargeB.data());
+ gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA, pme_pp->chargeB);
if (useGpuForPme)
{
stateGpu->reinit(nat, nat);
"GPU Direct PME-PP communication has been enabled, "
"but PME run mode is not PmeRunMode::GPU\n");
- // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses.
- pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks(
- stateGpu->getCoordinates());
- pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(
- reinterpret_cast<rvec*>(pme_gpu_get_device_f(pme)));
+ // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses and re-set atom ranges associated with transfers.
+ pme_pp->pmeCoordinateReceiverGpu->reinitCoordinateReceiver(stateGpu->getCoordinates());
+ pme_pp->pmeForceSenderGpu->setForceSendBuffer(pme_gpu_get_device_f(pme));
}
}
{
if (pme_pp->useGpuDirectComm)
{
- pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect(
- sender.rankId);
+ if (GMX_THREAD_MPI)
+ {
+ pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect(
+ sender.rankId);
+ }
+ else
+ {
+ pme_pp->pmeCoordinateReceiverGpu->launchReceiveCoordinatesFromPpCudaMpi(
+ stateGpu->getCoordinates(), nat, sender.numAtoms * sizeof(rvec), sender.rankId);
+ }
}
else
{
}
}
- if (pme_pp->useGpuDirectComm)
- {
- pme_pp->pmeCoordinateReceiverGpu->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
- }
-
status = pmerecvqxX;
}
}
/*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
-static void gmx_pme_send_force_vir_ener(gmx_pme_pp* pme_pp,
+static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
+ gmx_pme_pp* pme_pp,
const PmeOutput& output,
real dvdlambda_q,
real dvdlambda_lj,
int messages, ind_start, ind_end;
cve.cycles = cycles;
- /* Now the evaluated forces have to be transferred to the PP nodes */
+ if (pme_pp->useGpuDirectComm)
+ {
+ GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
+ "The use of GPU direct communication for PME-PP is enabled, "
+ "but the PME GPU force reciever object does not exist");
+ }
+
messages = 0;
ind_end = 0;
- for (const auto& receiver : pme_pp->ppRanks)
+
+ /* Now the evaluated forces have to be transferred to the PP ranks */
+ if (pme_pp->useGpuDirectComm && GMX_THREAD_MPI)
{
- ind_start = ind_end;
- ind_end = ind_start + receiver.numAtoms;
- void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
- if (pme_pp->useGpuDirectComm)
+ int numPpRanks = static_cast<int>(pme_pp->ppRanks.size());
+# pragma omp parallel for num_threads(std::min(numPpRanks, pme.nthread)) schedule(static)
+ for (int i = 0; i < numPpRanks; i++)
{
- GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
- "The use of GPU direct communication for PME-PP is enabled, "
- "but the PME GPU force reciever object does not exist");
- pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(receiver.rankId);
+ auto& receiver = pme_pp->ppRanks[i];
+ pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+ receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
}
- else
+ }
+ else
+ {
+ for (const auto& receiver : pme_pp->ppRanks)
{
- // Send using MPI
- MPI_Isend(sendbuf,
- receiver.numAtoms * sizeof(rvec),
- MPI_BYTE,
- receiver.rankId,
- 0,
- pme_pp->mpi_comm_mysim,
- &pme_pp->req[messages]);
+ ind_start = ind_end;
+ ind_end = ind_start + receiver.numAtoms;
+ if (pme_pp->useGpuDirectComm)
+ {
+ pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme),
+ ind_start,
+ receiver.numAtoms * sizeof(rvec),
+ receiver.rankId,
+ &pme_pp->req[messages]);
+ }
+ else
+ {
+ void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
+ // Send using MPI
+ MPI_Isend(sendbuf,
+ receiver.numAtoms * sizeof(rvec),
+ MPI_BYTE,
+ receiver.rankId,
+ 0,
+ pme_pp->mpi_comm_mysim,
+ &pme_pp->req[messages]);
+ }
messages++;
}
}
MPI_Waitall(messages, pme_pp->req.data(), pme_pp->stat.data());
#else
GMX_RELEASE_ASSERT(false, "Invalid call to gmx_pme_send_force_vir_ener");
+ GMX_UNUSED_VALUE(pme);
GMX_UNUSED_VALUE(pme_pp);
GMX_UNUSED_VALUE(output);
GMX_UNUSED_VALUE(dvdlambda_q);
gmx_walltime_accounting_t walltime_accounting,
t_inputrec* ir,
PmeRunMode runMode,
+ bool useGpuPmePpCommunication,
const gmx::DeviceStreamManager* deviceStreamManager)
{
int ret;
"Device stream can not be nullptr when using GPU in PME-only rank");
changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
- if (c_enableGpuPmePpComms)
+ if (useGpuPmePpCommunication)
{
pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
- deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
- pme_pp->mpi_comm_mysim,
- pme_pp->ppRanks);
- pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
- pme_gpu_get_f_ready_synchronizer(pme), pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+ pme_pp->mpi_comm_mysim, deviceStreamManager->context(), pme_pp->ppRanks);
+ pme_pp->pmeForceSenderGpu =
+ std::make_unique<gmx::PmeForceSenderGpu>(pme_gpu_get_f_ready_synchronizer(pme),
+ pme_pp->mpi_comm_mysim,
+ deviceStreamManager->context(),
+ pme_pp->ppRanks);
}
// TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
// This should be made safer.
if (count == 0)
{
- wallcycle_start(wcycle, ewcRUN);
+ wallcycle_start(wcycle, WallCycleCounter::Run);
walltime_accounting_start_time(walltime_accounting);
}
- wallcycle_start(wcycle, ewcPMEMESH);
+ wallcycle_start(wcycle, WallCycleCounter::PmeMesh);
dvdlambda_q = 0;
dvdlambda_lj = 0;
pme_gpu_prepare_computation(pme, box, wcycle, stepWork);
if (!pme_pp->useGpuDirectComm)
{
- stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::AtomLocality::All);
+ stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x),
+ gmx::AtomLocality::Local);
}
// On the separate PME rank we do not need a synchronizer as we schedule everything in a single stream
// TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157
auto xReadyOnDevice = nullptr;
- pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle, lambda_q);
+ pme_gpu_launch_spread(pme,
+ xReadyOnDevice,
+ wcycle,
+ lambda_q,
+ pme_pp->useGpuDirectComm,
+ pme_pp->pmeCoordinateReceiverGpu.get());
pme_gpu_launch_complex_transforms(pme, wcycle, stepWork);
pme_gpu_launch_gather(pme, wcycle, lambda_q);
output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle);
output.forces_ = pme_pp->f;
}
- cycles = wallcycle_stop(wcycle, ewcPMEMESH);
- gmx_pme_send_force_vir_ener(pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
+ cycles = wallcycle_stop(wcycle, WallCycleCounter::PmeMesh);
+ gmx_pme_send_force_vir_ener(*pme, pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
count++;
} /***** end of quasi-loop, we stop with the break above */