Improve handling of PME GPU force buffer
authorMark Abraham <mark.j.abraham@gmail.com>
Mon, 6 Nov 2017 08:28:01 +0000 (09:28 +0100)
committerMark Abraham <mark.j.abraham@gmail.com>
Fri, 17 Nov 2017 16:42:29 +0000 (17:42 +0100)
Managed it with the HostAllocator, and moved the responsibility
for its lifetime to the PME GPU staging structure. The buffer
does not use CUDA pinning yet.

Change-Id: Ia6fdbdb2509137fec1c6cf2a4ac8c04b1696b58f

12 files changed:
src/gromacs/ewald/pme-gather.cu
src/gromacs/ewald/pme-gpu-internal.cpp
src/gromacs/ewald/pme-gpu-internal.h
src/gromacs/ewald/pme-gpu-types.h
src/gromacs/ewald/pme-gpu.cpp
src/gromacs/ewald/pme-only.cpp
src/gromacs/ewald/pme.cu
src/gromacs/ewald/pme.h
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/mdlib/forcerec.cpp
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdtypes/forcerec.h

index 85f4309730902f03e1dd82f5ed74fbbf2c69eda5..b5acaa50b3262ce33fd829163dd5dd734a8ac9f9 100644 (file)
@@ -410,8 +410,7 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
     }
 }
 
-void pme_gpu_gather(const PmeGpu          *pmeGpu,
-                    float                 *h_forces,
+void pme_gpu_gather(PmeGpu                *pmeGpu,
                     PmeForceOutputHandling forceTreatment,
                     const float           *h_grid
                     )
@@ -419,7 +418,7 @@ void pme_gpu_gather(const PmeGpu          *pmeGpu,
     /* Copying the input CPU forces for reduction */
     if (forceTreatment != PmeForceOutputHandling::Set)
     {
-        pme_gpu_copy_input_forces(pmeGpu, h_forces);
+        pme_gpu_copy_input_forces(pmeGpu);
     }
 
     cudaStream_t stream          = pmeGpu->archSpecific->pmeStream;
@@ -468,5 +467,5 @@ void pme_gpu_gather(const PmeGpu          *pmeGpu,
     CU_LAUNCH_ERR("pme_gather_kernel");
     pme_gpu_stop_timing(pmeGpu, gtPME_GATHER);
 
-    pme_gpu_copy_output_forces(pmeGpu, h_forces);
+    pme_gpu_copy_output_forces(pmeGpu);
 }
index d3b27c3e8f6363d42185263ad7e73c9aa0546128..0844da4eefa40145272d86c5db5d39192689d9b6 100644 (file)
@@ -77,6 +77,11 @@ static PmeGpuKernelParamsBase *pme_gpu_get_kernel_params_base_ptr(const PmeGpu *
     return kernelParamsPtr;
 }
 
+gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGPU)
+{
+    return pmeGPU->staging.h_forces;
+}
+
 void pme_gpu_get_energy_virial(const PmeGpu *pmeGPU, real *energy, matrix virial)
 {
     for (int j = 0; j < c_virialAndEnergyCount; j++)
index 6d1b88414e334144480400b52d5cd7d50bd76c01..be50ead6afc23a28f83b146608752f9a691cf98b 100644 (file)
@@ -48,6 +48,7 @@
 
 #include "gromacs/fft/fft.h"                   // for the gmx_fft_direction enum
 #include "gromacs/gpu_utils/gpu_macros.h"      // for the CUDA_FUNC_ macros
+#include "gromacs/utility/arrayref.h"
 
 #include "pme-gpu-types.h"                     // for the inline functions accessing PmeGpu members
 
@@ -173,7 +174,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const PmeGpu *CUDA_FUNC_ARG
  *
  * \param[in] pmeGPU             The PME GPU structure.
  */
-CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 
 /*! \libinternal \brief
  * Frees the GPU buffer for the PME forces.
@@ -187,19 +188,15 @@ CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pm
  * To be called e.g. after the bonded calculations.
  *
  * \param[in] pmeGPU             The PME GPU structure.
- * \param[in] h_forces           The input forces rvec buffer.
  */
-CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
-                                                   const float     *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 
 /*! \libinternal \brief
  * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
  *
  * \param[in] pmeGPU             The PME GPU structure.
- * \param[out] h_forces          The output forces rvec buffer.
  */
-CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
-                                                    float           *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
 
 /*! \libinternal \brief
  * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
@@ -473,13 +470,11 @@ CUDA_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGp
  * A GPU force gathering function.
  *
  * \param[in]     pmeGpu           The PME GPU structure.
- * \param[in,out] h_forces         The host buffer with input and output forces.
  * \param[in]     forceTreatment   Tells how data in h_forces should be treated.
  *                                 TODO: determine efficiency/balance of host/device-side reductions.
  * \param[in]     h_grid           The host-side grid buffer (used only in testing mode)
  */
-CUDA_FUNC_QUALIFIER void pme_gpu_gather(const PmeGpu          *CUDA_FUNC_ARGUMENT(pmeGpu),
-                                        float                 *CUDA_FUNC_ARGUMENT(h_forces),
+CUDA_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu                *CUDA_FUNC_ARGUMENT(pmeGpu),
                                         PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment),
                                         const float           *CUDA_FUNC_ARGUMENT(h_grid)
                                         ) CUDA_FUNC_TERM
@@ -569,6 +564,14 @@ gmx_inline bool pme_gpu_is_testing(const PmeGpu *pmeGPU)
 
 /* A block of C++ functions that live in pme-gpu-internal.cpp */
 
+/*! \libinternal \brief
+ * Returns the GPU gathering staging forces buffer.
+ *
+ * \param[in] pmeGPU             The PME GPU structure.
+ * \returns                      The input/output forces.
+ */
+gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGPU);
+
 /*! \libinternal \brief
  * Returns the output virial and energy of the PME solving.
  * Should be called after pme_gpu_finish_computation.
index 4e768410b79872387ad44947cdc02bc989e67bc1..d96d8d434b41a6db9c94c1c3e330c257d8f01266 100644 (file)
@@ -60,6 +60,7 @@
 
 #include "gromacs/ewald/pme.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/basedefinitions.h"
 
@@ -256,6 +257,10 @@ struct PmeGpuSettings
  */
 struct PmeGpuStaging
 {
+    //TODO pin me with whatever method we settle on
+    //! Host-side force buffer
+    std::vector < gmx::RVec, gmx::HostAllocator < gmx::RVec>> h_forces;
+
     /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */
     float  *h_virialAndEnergy;
     /*! \brief B-spline values intermediate host-side buffer. */
index fbaadf7a9e04f1b552ed3a69beb3ac9de5b37cbc..0e270a4bf3f6a2484ac5778fa086b7570fa0e2f3 100644 (file)
@@ -298,7 +298,6 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t      *pme,
 
 void pme_gpu_launch_gather(const gmx_pme_t                 *pme,
                            gmx_wallcycle_t gmx_unused       wcycle,
-                           rvec                            *forces,
                            PmeForceOutputHandling           forceTreatment)
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
@@ -312,15 +311,17 @@ void pme_gpu_launch_gather(const gmx_pme_t                 *pme,
     wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
     const unsigned int gridIndex  = 0;
     real              *fftgrid    = pme->fftgrid[gridIndex];
-    pme_gpu_gather(pme->gpu, reinterpret_cast<float *>(forces), forceTreatment, reinterpret_cast<float *>(fftgrid));
+    pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid));
     wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
     wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 }
 
-void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
-                          gmx_wallcycle_t  wcycle,
-                          matrix           vir_q,
-                          real            *energy_q)
+void
+pme_gpu_wait_for_gpu(const gmx_pme_t                *pme,
+                     gmx_wallcycle_t                 wcycle,
+                     gmx::ArrayRef<const gmx::RVec> *forces,
+                     matrix                          virial,
+                     real                           *energy)
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 
@@ -330,23 +331,17 @@ void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
     pme_gpu_finish_computation(pme->gpu);
     wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
 
+    *forces = pme_gpu_get_forces(pme->gpu);
+
     if (haveComputedEnergyAndVirial)
     {
-        if (pme->doCoulomb)
+        if (pme_gpu_performs_solve(pme->gpu))
         {
-            if (pme_gpu_performs_solve(pme->gpu))
-            {
-                pme_gpu_get_energy_virial(pme->gpu, energy_q, vir_q);
-            }
-            else
-            {
-                get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy_q, vir_q);
-            }
+            pme_gpu_get_energy_virial(pme->gpu, energy, virial);
         }
         else
         {
-            *energy_q = 0;
+            get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy, virial);
         }
     }
-    /* No additional haveComputedForces code since forces are copied to the output host buffer with no transformation. */
 }
index 88051eef0d321a8e6fbbe8ee76e9e62d4ef6d2d2..9c81b47cb27d96ad35336b15ebd408675259cffd 100644 (file)
@@ -461,6 +461,7 @@ static int gmx_pme_recv_coeffs_coords(gmx_pme_pp        *pme_pp,
 
 /*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
 static void gmx_pme_send_force_vir_ener(gmx_pme_pp *pme_pp,
+                                        const rvec *f,
                                         matrix vir_q, real energy_q,
                                         matrix vir_lj, real energy_lj,
                                         real dvdlambda_q, real dvdlambda_lj,
@@ -478,7 +479,8 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp *pme_pp,
     {
         ind_start = ind_end;
         ind_end   = ind_start + receiver.numAtoms;
-        if (MPI_Isend(pme_pp->f[ind_start], (ind_end-ind_start)*sizeof(rvec), MPI_BYTE,
+        if (MPI_Isend(const_cast<void *>(static_cast<const void *>(f[ind_start])),
+                      (ind_end-ind_start)*sizeof(rvec), MPI_BYTE,
                       receiver.rankId, 0,
                       pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]) != 0)
         {
@@ -512,6 +514,7 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp *pme_pp,
 #else
     gmx_call("MPI not enabled");
     GMX_UNUSED_VALUE(pme_pp);
+    GMX_UNUSED_VALUE(f);
     GMX_UNUSED_VALUE(vir_q);
     GMX_UNUSED_VALUE(energy_q);
     GMX_UNUSED_VALUE(vir_lj);
@@ -616,6 +619,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
         // from mdatoms for the other call to gmx_pme_do), so we have
         // fewer lines of code and less parameter passing.
         const int pmeFlags = GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0);
+        gmx::ArrayRef<const gmx::RVec> forces;
         if (runMode != PmeRunMode::CPU)
         {
             const bool boxChanged = true;
@@ -624,8 +628,8 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
             pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags);
             pme_gpu_launch_spread(pme, as_rvec_array(pme_pp->x.data()), wcycle);
             pme_gpu_launch_complex_transforms(pme, wcycle);
-            pme_gpu_launch_gather(pme, wcycle, as_rvec_array(pme_pp->f.data()), PmeForceOutputHandling::Set);
-            pme_gpu_wait_for_gpu(pme, wcycle, vir_q, &energy_q);
+            pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
+            pme_gpu_wait_for_gpu(pme, wcycle, &forces, vir_q, &energy_q);
         }
         else
         {
@@ -637,11 +641,12 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
                        vir_q, vir_lj,
                        &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
                        pmeFlags);
+            forces = pme_pp->f;
         }
 
         cycles = wallcycle_stop(wcycle, ewcPMEMESH);
 
-        gmx_pme_send_force_vir_ener(pme_pp.get(),
+        gmx_pme_send_force_vir_ener(pme_pp.get(), as_rvec_array(forces.data()),
                                     vir_q, energy_q, vir_lj, energy_lj,
                                     dvdlambda_q, dvdlambda_lj, cycles);
 
index a948730cde46e377cebe3943b7c8452b68eb48c3..c7e3bc71a41a98647e456a9d67b784a217555566 100644 (file)
@@ -137,12 +137,14 @@ void pme_gpu_free_bspline_values(const PmeGpu *pmeGPU)
                      &pmeGPU->archSpecific->splineValuesSizeAlloc);
 }
 
-void pme_gpu_realloc_forces(const PmeGpu *pmeGPU)
+void pme_gpu_realloc_forces(PmeGpu *pmeGPU)
 {
     const size_t newForcesSize = pmeGPU->nAtomsAlloc * DIM;
     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
     cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_forces, nullptr, sizeof(float),
                         &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc, newForcesSize, pmeGPU->archSpecific->pmeStream, true);
+    pmeGPU->staging.h_forces.reserve(pmeGPU->nAtomsAlloc);
+    pmeGPU->staging.h_forces.resize(pmeGPU->kernelParams->atoms.nAtoms);
 }
 
 void pme_gpu_free_forces(const PmeGpu *pmeGPU)
@@ -150,20 +152,18 @@ void pme_gpu_free_forces(const PmeGpu *pmeGPU)
     cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc);
 }
 
-void pme_gpu_copy_input_forces(const PmeGpu *pmeGPU, const float *h_forces)
+void pme_gpu_copy_input_forces(PmeGpu *pmeGPU)
 {
-    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
     const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
     GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, pmeGPU->staging.h_forces.data(), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 }
 
-void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
+void pme_gpu_copy_output_forces(PmeGpu *pmeGPU)
 {
-    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
     const size_t forcesSize   = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
     GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_copy_D2H(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
+    cu_copy_D2H(pmeGPU->staging.h_forces.data(), pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
 }
 
 void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU)
index e82ebb04bfac403729df095004488e2ab5b31329..571f21d4e735ee9a76a9b7b57ee95a51e8b4c7b7 100644 (file)
@@ -53,6 +53,7 @@
 #include "gromacs/math/vectypes.h"
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/basedefinitions.h"
 #include "gromacs/utility/real.h"
 
@@ -320,14 +321,12 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t       *pme,
  *
  * \param[in]  pme               The PME data structure.
  * \param[in]  wcycle            The wallclock counter.
- * \param[in,out] forces         The array of local atoms' resulting forces.
- * \param[in]  forceTreatment    Tells how data in h_forces should be treated. The gathering kernel either stores
+ * \param[in]  forceTreatment    Tells how data should be treated. The gathering kernel either stores
  *                               the output reciprocal forces into the host array, or copies its contents to the GPU first
  *                               and accumulates. The reduction is non-atomic.
  */
 void pme_gpu_launch_gather(const gmx_pme_t        *pme,
                            gmx_wallcycle_t         wcycle,
-                           rvec                   *forces,
                            PmeForceOutputHandling  forceTreatment);
 
 /*! \brief
@@ -335,13 +334,15 @@ void pme_gpu_launch_gather(const gmx_pme_t        *pme,
  * (if they were to be computed).
  *
  * \param[in]  pme            The PME data structure.
- * \param[in]  wcycle         The wallclock counter.
- * \param[out] vir_q          The output virial matrix.
- * \param[out] energy_q       The output energy.
+ * \param[out] wcycle         The wallclock counter.
+ * \param[out] forces         The output forces.
+ * \param[out] virial         The output virial matrix.
+ * \param[out] energy         The output energy.
  */
-void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
-                          gmx_wallcycle_t  wcycle,
-                          matrix           vir_q,
-                          real            *energy_q);
+void pme_gpu_wait_for_gpu(const gmx_pme_t                *pme,
+                          gmx_wallcycle_t                 wcycle,
+                          gmx::ArrayRef<const gmx::RVec> *forces,
+                          matrix                          virial,
+                          real                           *energy);
 
 #endif
index b16dbf4b0f4609d10ca84d0b2f2caa633cc9f7ab..f7c7b0f141f48f9b660da98ac8e99321085d5d70 100644 (file)
@@ -388,8 +388,24 @@ void pmePerformGather(gmx_pme_t *pme, CodePath mode,
             break;
 
         case CodePath::CUDA:
-            pme_gpu_gather(pme->gpu, reinterpret_cast<float *>(forces.begin()), inputTreatment, reinterpret_cast<float *>(fftgrid));
-            break;
+        {
+            // Variable initialization needs a non-switch scope
+            auto stagingForces = pme_gpu_get_forces(pme->gpu);
+            GMX_ASSERT(forces.size() == stagingForces.size(), "Size of force buffers did not match");
+            if (forceReductionWithInput)
+            {
+                for (size_t i = 0; i != forces.size(); ++i)
+                {
+                    stagingForces[i] = forces[i];
+                }
+            }
+            pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid));
+            for (size_t i = 0; i != forces.size(); ++i)
+            {
+                forces[i] = stagingForces[i];
+            }
+        }
+        break;
 
         default:
             GMX_THROW(InternalError("Test not implemented for this mode"));
index f4889db728dee3da4e54c9c44097d44173a3f0f6..05c59c2f2937752d83b48a44dfdab5bb5191519e 100644 (file)
@@ -1514,11 +1514,6 @@ void forcerec_set_ranges(t_forcerec *fr,
     {
         fr->forceBufferForDirectVirialContributions->resize(natoms_f_novirsum);
     }
-
-    if (fr->ic->cutoff_scheme == ecutsVERLET)
-    {
-        fr->forceBufferIntermediate->resize(ncg_home);
-    }
 }
 
 static real cutoff_inf(real cutoff)
@@ -2793,8 +2788,6 @@ void init_forcerec(FILE                *fp,
         fr->forceBufferForDirectVirialContributions = new std::vector<gmx::RVec>;
     }
 
-    fr->forceBufferIntermediate = new std::vector<gmx::RVec>; //TODO add proper conditionals
-
     if (fr->cutoff_scheme == ecutsGROUP &&
         ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
     {
index 2f11ed7c76d28cf26ca061dbcc0438a4a778d189..7f625a3b0dd3d51a4ad608ec9b2a8681c113e4a0 100644 (file)
@@ -829,15 +829,13 @@ static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
  * This function only implements setting the output forces (no accumulation).
  *
  * \param[in]  pmedata        The PME structure
- * \param[out] pmeGpuForces   The array of where the output forces are copied
  * \param[in]  wcycle         The wallcycle structure
  */
 static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
-                                     ArrayRef<RVec>    pmeGpuForces,
                                      gmx_wallcycle_t   wcycle)
 {
     pme_gpu_launch_complex_transforms(pmedata, wcycle);
-    pme_gpu_launch_gather(pmedata, wcycle, as_rvec_array(pmeGpuForces.data()), PmeForceOutputHandling::Set);
+    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
 }
 
 static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
@@ -879,8 +877,6 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
     // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
     const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
         ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Hybrid));
-    // a comment for uncrustify
-    const ArrayRef<RVec> pmeGpuForces = *fr->forceBufferIntermediate;
 
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -1075,7 +1071,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         // X copy/transform to allow overlap.
         // Note that this is advantageous for the case where NB and PME
         // tasks run on the same device, but may not be ideal otherwise.
-        launchPmeGpuFftAndGather(fr->pmedata, pmeGpuForces, wcycle);
+        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
     }
 
     if (bUseGPU)
@@ -1100,7 +1096,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
         // PME GPU - intermediate CPU work in mixed mode
         // TODO - move this below till after do_force_lowlevel() / special forces?
         //        (to allow overlap of spread/drid D2H with some CPU work)
-        launchPmeGpuFftAndGather(fr->pmedata, pmeGpuForces, wcycle);
+        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
     }
 
     /* Communicate coordinates and sum dipole if necessary +
@@ -1417,13 +1413,11 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
 
     if (useGpuPme)
     {
+        gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
         matrix vir_Q;
         real   Vlr_q;
-        pme_gpu_wait_for_gpu(fr->pmedata, wcycle, vir_Q, &Vlr_q);
-
-        pme_gpu_reduce_outputs(wcycle, &forceWithVirial,
-                               pmeGpuForces,
-                               enerd, vir_Q, Vlr_q);
+        pme_gpu_wait_for_gpu(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
+        pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
     }
 
     if (bUseOrEmulGPU)
index cea244b21c65cb379d1053ab544514f737556225..ce3b8b341f39fc2eb3c401855e0ae9bb719d6c88 100644 (file)
@@ -292,14 +292,9 @@ struct t_forcerec {
     gmx_bool          haveDirectVirialContributions;
 #ifdef __cplusplus
     /* TODO: Replace the pointer by an object once we got rid of C */
-    std::vector<gmx::RVec> *forceBufferForDirectVirialContributions;
-    /* This buffer is currently only used for storing the PME GPU output until reduction.
-     * TODO: Pagelock/pin it
-     * TODO: Replace the pointer by an object once we got rid of C */
-    std::vector<gmx::RVec>  *forceBufferIntermediate;
+    std::vector<gmx::RVec>  *forceBufferForDirectVirialContributions;
 #else
     void                    *forceBufferForDirectVirialContributions_dummy;
-    void                    *forceBufferIntermediate_dummy;
 #endif
 
     /* Data for PPPM/PME/Ewald */