Pipeline GPU PME Spline/Spread with PP Comms

[alexxy/gromacs.git] / src / gromacs / ewald / pme_only.cpp
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp

index a01b5080c500952ce527a7f2f54ffa3684fea224..56002966be057185ae0bf3924073cd6962b43759 100644 (file)
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -4,7 +4,7 @@
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
   * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -82,6 +82,7 @@
  #include "gromacs/fileio/pdbio.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/hostallocator.h"
  #include "gromacs/math/gmxcomplex.h"
  #include "gromacs/math/units.h"
@@ -100,13 +101,10 @@
  #include "gromacs/utility/smalloc.h"
  
  #include "pme_gpu_internal.h"
+#include "pme_internal.h"
  #include "pme_output.h"
  #include "pme_pp_communication.h"
  
-/*! \brief environment variable to enable GPU P2P communication */
-static const bool c_enableGpuPmePpComms =
-        (getenv("GMX_GPU_PME_PP_COMMS") != nullptr) && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA);
-
  /*! \brief Master PP-PME communication data structure */
  struct gmx_pme_pp
  {
@@ -137,6 +135,8 @@ struct gmx_pme_pp
  
      /*! \brief whether GPU direct communications are active for PME-PP transfers */
      bool useGpuDirectComm = false;
+    /*! \brief whether GPU direct communications should send forces directly to remote GPU memory */
+    bool sendForcesDirectToPpGpu = false;
  };
  
  /*! \brief Initialize the PME-only side of the PME <-> PP communication */
@@ -166,17 +166,17 @@ static std::unique_ptr<gmx_pme_pp> gmx_pme_pp_init(const t_commrec* cr)
      return pme_pp;
  }
  
-static void reset_pmeonly_counters(gmx_wallcycle_t           wcycle,
+static void reset_pmeonly_counters(gmx_wallcycle*            wcycle,
                                     gmx_walltime_accounting_t walltime_accounting,
                                     t_nrnb*                   nrnb,
                                     int64_t                   step,
                                     bool                      useGpuForPme)
  {
      /* Reset all the counters related to performance over the run */
-    wallcycle_stop(wcycle, ewcRUN);
+    wallcycle_stop(wcycle, WallCycleCounter::Run);
      wallcycle_reset_all(wcycle);
      *nrnb = { 0 };
-    wallcycle_start(wcycle, ewcRUN);
+    wallcycle_start(wcycle, WallCycleCounter::Run);
      walltime_accounting_reset_time(walltime_accounting, step);
  
      if (useGpuForPme)
@@ -219,25 +219,31 @@ static gmx_pme_t* gmx_pmeonly_switch(std::vector<gmx_pme_t*>* pmedata,
  
  /*! \brief Called by PME-only ranks to receive coefficients and coordinates
   *
- * \param[in] pme           PME data structure.
- * \param[in,out] pme_pp    PME-PP communication structure.
- * \param[out] natoms       Number of received atoms.
- * \param[out] box        System box, if received.
- * \param[out] maxshift_x        Maximum shift in X direction, if received.
- * \param[out] maxshift_y        Maximum shift in Y direction, if received.
- * \param[out] lambda_q         Free-energy lambda for electrostatics, if received.
- * \param[out] lambda_lj         Free-energy lambda for Lennard-Jones, if received.
- * \param[out] bEnerVir          Set to true if this is an energy/virial calculation step, otherwise
- * set to false. \param[out] step              MD integration step number. \param[out] grid_size PME
- * grid size, if received. \param[out] ewaldcoeff_q         Ewald cut-off parameter for
- * electrostatics, if received. \param[out] ewaldcoeff_lj         Ewald cut-off parameter for
- * Lennard-Jones, if received. \param[in] useGpuForPme      flag on whether PME is on GPU \param[in]
- * stateGpu          GPU state propagator object \param[in] runMode           PME run mode
+ * Note that with GPU direct communication the transfer is only initiated, it is the responsibility
+ * of the caller to synchronize prior to launching spread.
+ *
+ * \param[in] pme                     PME data structure.
+ * \param[in,out] pme_pp              PME-PP communication structure.
+ * \param[out] natoms                 Number of received atoms.
+ * \param[out] box                    System box, if received.
+ * \param[out] maxshift_x             Maximum shift in X direction, if received.
+ * \param[out] maxshift_y             Maximum shift in Y direction, if received.
+ * \param[out] lambda_q               Free-energy lambda for electrostatics, if received.
+ * \param[out] lambda_lj              Free-energy lambda for Lennard-Jones, if received.
+ * \param[out] computeEnergyAndVirial Set to true if this is an energy/virial calculation
+ *                                    step, otherwise set to false.
+ * \param[out] step                   MD integration step number.
+ * \param[out] grid_size              PME grid size, if received.
+ * \param[out] ewaldcoeff_q           Ewald cut-off parameter for electrostatics, if received.
+ * \param[out] ewaldcoeff_lj          Ewald cut-off parameter for Lennard-Jones, if received.
+ * \param[in]  useGpuForPme           Flag on whether PME is on GPU.
+ * \param[in]  stateGpu               GPU state propagator object.
+ * \param[in]  runMode                PME run mode.
   *
- * \retval pmerecvqxX             All parameters were set, chargeA and chargeB can be NULL.
- * \retval pmerecvqxFINISH        No parameters were set.
- * \retval pmerecvqxSWITCHGRID    Only grid_size and *ewaldcoeff were set.
- * \retval pmerecvqxRESETCOUNTERS *step was set.
+ * \retval pmerecvqxX                 All parameters were set, chargeA and chargeB can be NULL.
+ * \retval pmerecvqxFINISH            No parameters were set.
+ * \retval pmerecvqxSWITCHGRID        Only grid_size and *ewaldcoeff were set.
+ * \retval pmerecvqxRESETCOUNTERS     *step was set.
   */
  static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                                        gmx_pme_pp*                  pme_pp,
@@ -247,22 +253,21 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                                        int*                         maxshift_y,
                                        real*                        lambda_q,
                                        real*                        lambda_lj,
-                                      gmx_bool*                    bEnerVir,
+                                      gmx_bool*                    computeEnergyAndVirial,
                                        int64_t*                     step,
                                        ivec*                        grid_size,
                                        real*                        ewaldcoeff_q,
                                        real*                        ewaldcoeff_lj,
                                        bool                         useGpuForPme,
                                        gmx::StatePropagatorDataGpu* stateGpu,
-                                      PmeRunMode gmx_unused runMode)
+                                      PmeRunMode gmx_unused        runMode)
  {
      int status = -1;
      int nat    = 0;
  
  #if GMX_MPI
-    unsigned int flags          = 0;
-    int          messages       = 0;
-    bool         atomSetChanged = false;
+    int  messages       = 0;
+    bool atomSetChanged = false;
  
      do
      {
@@ -270,17 +275,14 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
          cnb.flags = 0;
  
          /* Receive the send count, box and time step from the peer PP node */
-        MPI_Recv(&cnb, sizeof(cnb), MPI_BYTE, pme_pp->peerRankId, eCommType_CNB,
-                 pme_pp->mpi_comm_mysim, MPI_STATUS_IGNORE);
-
-        /* We accumulate all received flags */
-        flags |= cnb.flags;
+        MPI_Recv(&cnb, sizeof(cnb), MPI_BYTE, pme_pp->peerRankId, eCommType_CNB, pme_pp->mpi_comm_mysim, MPI_STATUS_IGNORE);
  
          *step = cnb.step;
  
          if (debug)
          {
-            fprintf(debug, "PME only rank receiving:%s%s%s%s%s\n",
+            fprintf(debug,
+                    "PME only rank receiving:%s%s%s%s%s\n",
                      (cnb.flags & PP_PME_CHARGE) ? " charges" : "",
                      (cnb.flags & PP_PME_COORD) ? " coordinates" : "",
                      (cnb.flags & PP_PME_FINISH) ? " finish" : "",
@@ -292,6 +294,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
          GMX_ASSERT(!pme_pp->useGpuDirectComm || (pme_pp->pmeForceSenderGpu != nullptr),
                     "The use of GPU direct communication for PME-PP is enabled, "
                     "but the PME GPU force reciever object does not exist");
+        pme_pp->sendForcesDirectToPpGpu = ((cnb.flags & PP_PME_RECVFTOGPU) != 0);
  
          if (cnb.flags & PP_PME_FINISH)
          {
@@ -327,8 +330,13 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                  }
                  else
                  {
-                    MPI_Irecv(&sender.numAtoms, sizeof(sender.numAtoms), MPI_BYTE, sender.rankId,
-                              eCommType_CNB, pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]);
+                    MPI_Irecv(&sender.numAtoms,
+                              sizeof(sender.numAtoms),
+                              MPI_BYTE,
+                              sender.rankId,
+                              eCommType_CNB,
+                              pme_pp->mpi_comm_mysim,
+                              &pme_pp->req[messages++]);
                  }
              }
              MPI_Waitall(messages, pme_pp->req.data(), pme_pp->stat.data());
@@ -395,12 +403,19 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                  {
                      if (sender.numAtoms > 0)
                      {
-                        MPI_Irecv(bufferPtr + nat, sender.numAtoms * sizeof(real), MPI_BYTE,
-                                  sender.rankId, q, pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]);
+                        MPI_Irecv(bufferPtr + nat,
+                                  sender.numAtoms * sizeof(real),
+                                  MPI_BYTE,
+                                  sender.rankId,
+                                  q,
+                                  pme_pp->mpi_comm_mysim,
+                                  &pme_pp->req[messages++]);
                          nat += sender.numAtoms;
                          if (debug)
                          {
-                            fprintf(debug, "Received from PP rank %d: %d %s\n", sender.rankId,
+                            fprintf(debug,
+                                    "Received from PP rank %d: %d %s\n",
+                                    sender.rankId,
                                      sender.numAtoms,
                                      (q == eCommType_ChargeA || q == eCommType_ChargeB) ? "charges"
                                                                                         : "params");
@@ -414,7 +429,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
          {
              if (atomSetChanged)
              {
-                gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA.data());
+                gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA, pme_pp->chargeB);
                  if (useGpuForPme)
                  {
                      stateGpu->reinit(nat, nat);
@@ -426,11 +441,9 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                                 "GPU Direct PME-PP communication has been enabled, "
                                 "but PME run mode is not PmeRunMode::GPU\n");
  
-                    // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses.
-                    pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks(
-                            stateGpu->getCoordinates());
-                    pme_pp->pmeForceSenderGpu->sendForceBufferAddressToPpRanks(
-                            reinterpret_cast<rvec*>(pme_gpu_get_device_f(pme)));
+                    // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses and re-set atom ranges associated with transfers.
+                    pme_pp->pmeCoordinateReceiverGpu->reinitCoordinateReceiver(stateGpu->getCoordinates());
+                    pme_pp->pmeForceSenderGpu->setForceSendBuffer(pme_gpu_get_device_f(pme));
                  }
              }
  
@@ -438,10 +451,10 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
              /* The box, FE flag and lambda are sent along with the coordinates
               *  */
              copy_mat(cnb.box, box);
-            *lambda_q  = cnb.lambda_q;
-            *lambda_lj = cnb.lambda_lj;
-            *bEnerVir  = ((cnb.flags & PP_PME_ENER_VIR) != 0U);
-            *step      = cnb.step;
+            *lambda_q               = cnb.lambda_q;
+            *lambda_lj              = cnb.lambda_lj;
+            *computeEnergyAndVirial = ((cnb.flags & PP_PME_ENER_VIR) != 0U);
+            *step                   = cnb.step;
  
              /* Receive the coordinates in place */
              nat = 0;
@@ -451,13 +464,26 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                  {
                      if (pme_pp->useGpuDirectComm)
                      {
-                        pme_pp->pmeCoordinateReceiverGpu->launchReceiveCoordinatesFromPpCudaDirect(
-                                sender.rankId);
+                        if (GMX_THREAD_MPI)
+                        {
+                            pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect(
+                                    sender.rankId);
+                        }
+                        else
+                        {
+                            pme_pp->pmeCoordinateReceiverGpu->launchReceiveCoordinatesFromPpCudaMpi(
+                                    stateGpu->getCoordinates(), nat, sender.numAtoms * sizeof(rvec), sender.rankId);
+                        }
                      }
                      else
                      {
-                        MPI_Irecv(pme_pp->x[nat], sender.numAtoms * sizeof(rvec), MPI_BYTE, sender.rankId,
-                                  eCommType_COORD, pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]);
+                        MPI_Irecv(pme_pp->x[nat],
+                                  sender.numAtoms * sizeof(rvec),
+                                  MPI_BYTE,
+                                  sender.rankId,
+                                  eCommType_COORD,
+                                  pme_pp->mpi_comm_mysim,
+                                  &pme_pp->req[messages++]);
                      }
                      nat += sender.numAtoms;
                      if (debug)
@@ -465,16 +491,12 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
                          fprintf(debug,
                                  "Received from PP rank %d: %d "
                                  "coordinates\n",
-                                sender.rankId, sender.numAtoms);
+                                sender.rankId,
+                                sender.numAtoms);
                      }
                  }
              }
  
-            if (pme_pp->useGpuDirectComm)
-            {
-                pme_pp->pmeCoordinateReceiverGpu->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
-            }
-
              status = pmerecvqxX;
          }
  
@@ -490,7 +512,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
      GMX_UNUSED_VALUE(maxshift_y);
      GMX_UNUSED_VALUE(lambda_q);
      GMX_UNUSED_VALUE(lambda_lj);
-    GMX_UNUSED_VALUE(bEnerVir);
+    GMX_UNUSED_VALUE(computeEnergyAndVirial);
      GMX_UNUSED_VALUE(step);
      GMX_UNUSED_VALUE(grid_size);
      GMX_UNUSED_VALUE(ewaldcoeff_q);
@@ -509,29 +531,6 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t*            pme,
      return status;
  }
  
-#if GMX_MPI
-/*! \brief Send force data to PP ranks */
-static void sendFToPP(void* sendbuf, PpRanks receiver, gmx_pme_pp* pme_pp, int* messages)
-{
-
-    if (pme_pp->useGpuDirectComm)
-    {
-        GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
-                   "The use of GPU direct communication for PME-PP is enabled, "
-                   "but the PME GPU force reciever object does not exist");
-
-        pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(receiver.rankId);
-    }
-    else
-    {
-        // Send using MPI
-        MPI_Isend(sendbuf, receiver.numAtoms * sizeof(rvec), MPI_BYTE, receiver.rankId, 0,
-                  pme_pp->mpi_comm_mysim, &pme_pp->req[*messages]);
-        *messages = *messages + 1;
-    }
-}
-#endif
-
  /*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
  static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
                                          gmx_pme_pp*      pme_pp,
@@ -545,21 +544,56 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
      int                    messages, ind_start, ind_end;
      cve.cycles = cycles;
  
-    /* Now the evaluated forces have to be transferred to the PP nodes */
+    if (pme_pp->useGpuDirectComm)
+    {
+        GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
+                   "The use of GPU direct communication for PME-PP is enabled, "
+                   "but the PME GPU force reciever object does not exist");
+    }
+
      messages = 0;
      ind_end  = 0;
-    for (const auto& receiver : pme_pp->ppRanks)
+
+    /* Now the evaluated forces have to be transferred to the PP ranks */
+    if (pme_pp->useGpuDirectComm && GMX_THREAD_MPI)
      {
-        ind_start     = ind_end;
-        ind_end       = ind_start + receiver.numAtoms;
-        void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
-        if (pme_pp->useGpuDirectComm)
+        int numPpRanks = static_cast<int>(pme_pp->ppRanks.size());
+#    pragma omp parallel for num_threads(std::min(numPpRanks, pme.nthread)) schedule(static)
+        for (int i = 0; i < numPpRanks; i++)
          {
-            // Data will be transferred directly from GPU.
-            rvec* d_f = reinterpret_cast<rvec*>(pme_gpu_get_device_f(&pme));
-            sendbuf   = reinterpret_cast<void*>(&d_f[ind_start]);
+            auto& receiver = pme_pp->ppRanks[i];
+            pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+                    receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
+        }
+    }
+    else
+    {
+        for (const auto& receiver : pme_pp->ppRanks)
+        {
+            ind_start = ind_end;
+            ind_end   = ind_start + receiver.numAtoms;
+            if (pme_pp->useGpuDirectComm)
+            {
+                pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme),
+                                                            ind_start,
+                                                            receiver.numAtoms * sizeof(rvec),
+                                                            receiver.rankId,
+                                                            &pme_pp->req[messages]);
+            }
+            else
+            {
+                void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
+                // Send using MPI
+                MPI_Isend(sendbuf,
+                          receiver.numAtoms * sizeof(rvec),
+                          MPI_BYTE,
+                          receiver.rankId,
+                          0,
+                          pme_pp->mpi_comm_mysim,
+                          &pme_pp->req[messages]);
+            }
+            messages++;
          }
-        sendFToPP(sendbuf, receiver, pme_pp, &messages);
      }
  
      /* send virial and energy to our last PP node */
@@ -578,13 +612,12 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
      {
          fprintf(debug, "PME rank sending to PP rank %d: virial and energy\n", pme_pp->peerRankId);
      }
-    MPI_Isend(&cve, sizeof(cve), MPI_BYTE, pme_pp->peerRankId, 1, pme_pp->mpi_comm_mysim,
-              &pme_pp->req[messages++]);
+    MPI_Isend(&cve, sizeof(cve), MPI_BYTE, pme_pp->peerRankId, 1, pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]);
  
      /* Wait for the forces to arrive */
      MPI_Waitall(messages, pme_pp->req.data(), pme_pp->stat.data());
  #else
-    gmx_call("MPI not enabled");
+    GMX_RELEASE_ASSERT(false, "Invalid call to gmx_pme_send_force_vir_ener");
      GMX_UNUSED_VALUE(pme);
      GMX_UNUSED_VALUE(pme_pp);
      GMX_UNUSED_VALUE(output);
@@ -594,25 +627,27 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
  #endif
  }
  
-int gmx_pmeonly(struct gmx_pme_t*         pme,
-                const t_commrec*          cr,
-                t_nrnb*                   mynrnb,
-                gmx_wallcycle*            wcycle,
-                gmx_walltime_accounting_t walltime_accounting,
-                t_inputrec*               ir,
-                PmeRunMode                runMode)
+int gmx_pmeonly(struct gmx_pme_t*               pme,
+                const t_commrec*                cr,
+                t_nrnb*                         mynrnb,
+                gmx_wallcycle*                  wcycle,
+                gmx_walltime_accounting_t       walltime_accounting,
+                t_inputrec*                     ir,
+                PmeRunMode                      runMode,
+                bool                            useGpuPmePpCommunication,
+                const gmx::DeviceStreamManager* deviceStreamManager)
  {
-    int      ret;
-    int      natoms = 0;
-    matrix   box;
-    real     lambda_q   = 0;
-    real     lambda_lj  = 0;
-    int      maxshift_x = 0, maxshift_y = 0;
-    real     dvdlambda_q, dvdlambda_lj;
-    float    cycles;
-    int      count;
-    gmx_bool bEnerVir = FALSE;
-    int64_t  step;
+    int     ret;
+    int     natoms = 0;
+    matrix  box;
+    real    lambda_q   = 0;
+    real    lambda_lj  = 0;
+    int     maxshift_x = 0, maxshift_y = 0;
+    real    dvdlambda_q, dvdlambda_lj;
+    float   cycles;
+    int     count;
+    bool    computeEnergyAndVirial = false;
+    int64_t step;
  
      /* This data will only use with PME tuning, i.e. switching PME grids */
      std::vector<gmx_pme_t*> pmedata;
@@ -625,23 +660,31 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
      const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
      if (useGpuForPme)
      {
-        const void* commandStream = pme_gpu_get_device_stream(pme);
-        const void* deviceContext = pme_gpu_get_device_context(pme);
-
+        GMX_RELEASE_ASSERT(
+                deviceStreamManager != nullptr,
+                "Device stream manager can not be nullptr when using GPU in PME-only rank.");
+        GMX_RELEASE_ASSERT(deviceStreamManager->streamIsValid(gmx::DeviceStreamType::Pme),
+                           "Device stream can not be nullptr when using GPU in PME-only rank");
          changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
          changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
-        if (c_enableGpuPmePpComms)
+        if (useGpuPmePpCommunication)
          {
              pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
-                    commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
-            pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
-                    commandStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+                    pme_pp->mpi_comm_mysim, deviceStreamManager->context(), pme_pp->ppRanks);
+            pme_pp->pmeForceSenderGpu =
+                    std::make_unique<gmx::PmeForceSenderGpu>(pme_gpu_get_f_ready_synchronizer(pme),
+                                                             pme_pp->mpi_comm_mysim,
+                                                             deviceStreamManager->context(),
+                                                             pme_pp->ppRanks);
          }
          // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
          //       This should be made safer.
          stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                commandStream, deviceContext, GpuApiCallBehavior::Async,
-                pme_gpu_get_padding_size(pme), wcycle);
+                &deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
+                deviceStreamManager->context(),
+                GpuApiCallBehavior::Async,
+                pme_gpu_get_block_size(pme),
+                wcycle);
      }
  
      clear_nrnb(mynrnb);
@@ -655,10 +698,22 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
              /* Domain decomposition */
              ivec newGridSize;
              real ewaldcoeff_q = 0, ewaldcoeff_lj = 0;
-            ret = gmx_pme_recv_coeffs_coords(pme, pme_pp.get(), &natoms, box, &maxshift_x,
-                                             &maxshift_y, &lambda_q, &lambda_lj, &bEnerVir, &step,
-                                             &newGridSize, &ewaldcoeff_q, &ewaldcoeff_lj,
-                                             useGpuForPme, stateGpu.get(), runMode);
+            ret = gmx_pme_recv_coeffs_coords(pme,
+                                             pme_pp.get(),
+                                             &natoms,
+                                             box,
+                                             &maxshift_x,
+                                             &maxshift_y,
+                                             &lambda_q,
+                                             &lambda_lj,
+                                             &computeEnergyAndVirial,
+                                             &step,
+                                             &newGridSize,
+                                             &ewaldcoeff_q,
+                                             &ewaldcoeff_lj,
+                                             useGpuForPme,
+                                             stateGpu.get(),
+                                             runMode);
  
              if (ret == pmerecvqxSWITCHGRID)
              {
@@ -681,11 +736,11 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
  
          if (count == 0)
          {
-            wallcycle_start(wcycle, ewcRUN);
+            wallcycle_start(wcycle, WallCycleCounter::Run);
              walltime_accounting_start_time(walltime_accounting);
          }
  
-        wallcycle_start(wcycle, ewcPMEMESH);
+        wallcycle_start(wcycle, WallCycleCounter::PmeMesh);
  
          dvdlambda_q  = 0;
          dvdlambda_lj = 0;
@@ -695,10 +750,10 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
          // from mdatoms for the other call to gmx_pme_do), so we have
          // fewer lines of code and less parameter passing.
          gmx::StepWorkload stepWork;
-        stepWork.computeVirial = bEnerVir;
-        stepWork.computeEnergy = bEnerVir;
+        stepWork.computeVirial = computeEnergyAndVirial;
+        stepWork.computeEnergy = computeEnergyAndVirial;
          stepWork.computeForces = true;
-        PmeOutput output;
+        PmeOutput output       = { {}, false, 0, { { 0 } }, 0, 0, { { 0 } }, 0 };
          if (useGpuForPme)
          {
              stepWork.haveDynamicBox      = false;
@@ -708,16 +763,22 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
              pme_gpu_prepare_computation(pme, box, wcycle, stepWork);
              if (!pme_pp->useGpuDirectComm)
              {
-                stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::AtomLocality::All);
+                stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x),
+                                               gmx::AtomLocality::Local);
              }
              // On the separate PME rank we do not need a synchronizer as we schedule everything in a single stream
              // TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157
              auto xReadyOnDevice = nullptr;
  
-            pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle);
+            pme_gpu_launch_spread(pme,
+                                  xReadyOnDevice,
+                                  wcycle,
+                                  lambda_q,
+                                  pme_pp->useGpuDirectComm,
+                                  pme_pp->pmeCoordinateReceiverGpu.get());
              pme_gpu_launch_complex_transforms(pme, wcycle, stepWork);
-            pme_gpu_launch_gather(pme, wcycle);
-            output = pme_gpu_wait_finish_task(pme, bEnerVir, wcycle);
+            pme_gpu_launch_gather(pme, wcycle, lambda_q);
+            output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle);
              pme_gpu_reinit_computation(pme, wcycle);
          }
          else
@@ -725,16 +786,34 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
              GMX_ASSERT(pme_pp->x.size() == static_cast<size_t>(natoms),
                         "The coordinate buffer should have size natoms");
  
-            gmx_pme_do(pme, pme_pp->x, pme_pp->f, pme_pp->chargeA.data(), pme_pp->chargeB.data(),
-                       pme_pp->sqrt_c6A.data(), pme_pp->sqrt_c6B.data(), pme_pp->sigmaA.data(),
-                       pme_pp->sigmaB.data(), box, cr, maxshift_x, maxshift_y, mynrnb, wcycle,
-                       output.coulombVirial_, output.lennardJonesVirial_, &output.coulombEnergy_,
-                       &output.lennardJonesEnergy_, lambda_q, lambda_lj, &dvdlambda_q,
-                       &dvdlambda_lj, stepWork);
+            gmx_pme_do(pme,
+                       pme_pp->x,
+                       pme_pp->f,
+                       pme_pp->chargeA,
+                       pme_pp->chargeB,
+                       pme_pp->sqrt_c6A,
+                       pme_pp->sqrt_c6B,
+                       pme_pp->sigmaA,
+                       pme_pp->sigmaB,
+                       box,
+                       cr,
+                       maxshift_x,
+                       maxshift_y,
+                       mynrnb,
+                       wcycle,
+                       output.coulombVirial_,
+                       output.lennardJonesVirial_,
+                       &output.coulombEnergy_,
+                       &output.lennardJonesEnergy_,
+                       lambda_q,
+                       lambda_lj,
+                       &dvdlambda_q,
+                       &dvdlambda_lj,
+                       stepWork);
              output.forces_ = pme_pp->f;
          }
  
-        cycles = wallcycle_stop(wcycle, ewcPMEMESH);
+        cycles = wallcycle_stop(wcycle, WallCycleCounter::PmeMesh);
          gmx_pme_send_force_vir_ener(*pme, pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
  
          count++;