Improve handling of PME GPU force buffer

author Mark Abraham <mark.j.abraham@gmail.com>

Mon, 6 Nov 2017 08:28:01 +0000 (09:28 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Fri, 17 Nov 2017 16:42:29 +0000 (17:42 +0100)
author Mark Abraham <mark.j.abraham@gmail.com>
Mon, 6 Nov 2017 08:28:01 +0000 (09:28 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Fri, 17 Nov 2017 16:42:29 +0000 (17:42 +0100)
diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu

index 85f4309730902f03e1dd82f5ed74fbbf2c69eda5..b5acaa50b3262ce33fd829163dd5dd734a8ac9f9 100644 (file)
--- a/src/gromacs/ewald/pme-gather.cu
+++ b/src/gromacs/ewald/pme-gather.cu
@@ -410,8 +410,7 @@ __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams    kernelParams)
      }
  }
  
-void pme_gpu_gather(const PmeGpu          *pmeGpu,
-                    float                 *h_forces,
+void pme_gpu_gather(PmeGpu                *pmeGpu,
                      PmeForceOutputHandling forceTreatment,
                      const float           *h_grid
                      )
@@ -419,7 +418,7 @@ void pme_gpu_gather(const PmeGpu          *pmeGpu,
      /* Copying the input CPU forces for reduction */
      if (forceTreatment != PmeForceOutputHandling::Set)
      {
-        pme_gpu_copy_input_forces(pmeGpu, h_forces);
+        pme_gpu_copy_input_forces(pmeGpu);
      }
  
      cudaStream_t stream          = pmeGpu->archSpecific->pmeStream;
@@ -468,5 +467,5 @@ void pme_gpu_gather(const PmeGpu          *pmeGpu,
      CU_LAUNCH_ERR("pme_gather_kernel");
      pme_gpu_stop_timing(pmeGpu, gtPME_GATHER);
  
-    pme_gpu_copy_output_forces(pmeGpu, h_forces);
+    pme_gpu_copy_output_forces(pmeGpu);
  }
diff --git a/src/gromacs/ewald/pme-gpu-internal.cpp b/src/gromacs/ewald/pme-gpu-internal.cpp

index d3b27c3e8f6363d42185263ad7e73c9aa0546128..0844da4eefa40145272d86c5db5d39192689d9b6 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-internal.cpp
+++ b/src/gromacs/ewald/pme-gpu-internal.cpp
@@ -77,6 +77,11 @@ static PmeGpuKernelParamsBase *pme_gpu_get_kernel_params_base_ptr(const PmeGpu *
      return kernelParamsPtr;
  }
  
+gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGPU)
+{
+    return pmeGPU->staging.h_forces;
+}
+
  void pme_gpu_get_energy_virial(const PmeGpu *pmeGPU, real *energy, matrix virial)
  {
      for (int j = 0; j < c_virialAndEnergyCount; j++)
diff --git a/src/gromacs/ewald/pme-gpu-internal.h b/src/gromacs/ewald/pme-gpu-internal.h

index 6d1b88414e334144480400b52d5cd7d50bd76c01..be50ead6afc23a28f83b146608752f9a691cf98b 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-internal.h
+++ b/src/gromacs/ewald/pme-gpu-internal.h
@@ -48,6 +48,7 @@
  
  #include "gromacs/fft/fft.h"                   // for the gmx_fft_direction enum
  #include "gromacs/gpu_utils/gpu_macros.h"      // for the CUDA_FUNC_ macros
+#include "gromacs/utility/arrayref.h"
  
  #include "pme-gpu-types.h"                     // for the inline functions accessing PmeGpu members
  
@@ -173,7 +174,7 @@ CUDA_FUNC_QUALIFIER void pme_gpu_free_bspline_values(const PmeGpu *CUDA_FUNC_ARG
   *
   * \param[in] pmeGPU             The PME GPU structure.
   */
-CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
+CUDA_FUNC_QUALIFIER void pme_gpu_realloc_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
  
  /*! \libinternal \brief
   * Frees the GPU buffer for the PME forces.
@@ -187,19 +188,15 @@ CUDA_FUNC_QUALIFIER void pme_gpu_free_forces(const PmeGpu *CUDA_FUNC_ARGUMENT(pm
   * To be called e.g. after the bonded calculations.
   *
   * \param[in] pmeGPU             The PME GPU structure.
- * \param[in] h_forces           The input forces rvec buffer.
   */
-CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
-                                                   const float     *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_input_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
  
  /*! \libinternal \brief
   * Copies the forces from the GPU to the CPU buffer. To be called after the gathering stage.
   *
   * \param[in] pmeGPU             The PME GPU structure.
- * \param[out] h_forces          The output forces rvec buffer.
   */
-CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGPU),
-                                                    float           *CUDA_FUNC_ARGUMENT(h_forces)) CUDA_FUNC_TERM
+CUDA_FUNC_QUALIFIER void pme_gpu_copy_output_forces(PmeGpu *CUDA_FUNC_ARGUMENT(pmeGPU)) CUDA_FUNC_TERM
  
  /*! \libinternal \brief
   * Reallocates the input coordinates buffer on the GPU (and clears the padded part if needed).
@@ -473,13 +470,11 @@ CUDA_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu    *CUDA_FUNC_ARGUMENT(pmeGp
   * A GPU force gathering function.
   *
   * \param[in]     pmeGpu           The PME GPU structure.
- * \param[in,out] h_forces         The host buffer with input and output forces.
   * \param[in]     forceTreatment   Tells how data in h_forces should be treated.
   *                                 TODO: determine efficiency/balance of host/device-side reductions.
   * \param[in]     h_grid           The host-side grid buffer (used only in testing mode)
   */
-CUDA_FUNC_QUALIFIER void pme_gpu_gather(const PmeGpu          *CUDA_FUNC_ARGUMENT(pmeGpu),
-                                        float                 *CUDA_FUNC_ARGUMENT(h_forces),
+CUDA_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu                *CUDA_FUNC_ARGUMENT(pmeGpu),
                                          PmeForceOutputHandling CUDA_FUNC_ARGUMENT(forceTreatment),
                                          const float           *CUDA_FUNC_ARGUMENT(h_grid)
                                          ) CUDA_FUNC_TERM
@@ -569,6 +564,14 @@ gmx_inline bool pme_gpu_is_testing(const PmeGpu *pmeGPU)
  
  /* A block of C++ functions that live in pme-gpu-internal.cpp */
  
+/*! \libinternal \brief
+ * Returns the GPU gathering staging forces buffer.
+ *
+ * \param[in] pmeGPU             The PME GPU structure.
+ * \returns                      The input/output forces.
+ */
+gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGPU);
+
  /*! \libinternal \brief
   * Returns the output virial and energy of the PME solving.
   * Should be called after pme_gpu_finish_computation.
diff --git a/src/gromacs/ewald/pme-gpu-types.h b/src/gromacs/ewald/pme-gpu-types.h

index 4e768410b79872387ad44947cdc02bc989e67bc1..d96d8d434b41a6db9c94c1c3e330c257d8f01266 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-types.h
+++ b/src/gromacs/ewald/pme-gpu-types.h
@@ -60,6 +60,7 @@
  
  #include "gromacs/ewald/pme.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/hostallocator.h"
  #include "gromacs/math/vectypes.h"
  #include "gromacs/utility/basedefinitions.h"
  
@@ -256,6 +257,10 @@ struct PmeGpuSettings
   */
  struct PmeGpuStaging
  {
+    //TODO pin me with whatever method we settle on
+    //! Host-side force buffer
+    std::vector < gmx::RVec, gmx::HostAllocator < gmx::RVec>> h_forces;
+
      /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */
      float  *h_virialAndEnergy;
      /*! \brief B-spline values intermediate host-side buffer. */
diff --git a/src/gromacs/ewald/pme-gpu.cpp b/src/gromacs/ewald/pme-gpu.cpp

index fbaadf7a9e04f1b552ed3a69beb3ac9de5b37cbc..0e270a4bf3f6a2484ac5778fa086b7570fa0e2f3 100644 (file)
--- a/src/gromacs/ewald/pme-gpu.cpp
+++ b/src/gromacs/ewald/pme-gpu.cpp
@@ -298,7 +298,6 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t      *pme,
  
  void pme_gpu_launch_gather(const gmx_pme_t                 *pme,
                             gmx_wallcycle_t gmx_unused       wcycle,
-                           rvec                            *forces,
                             PmeForceOutputHandling           forceTreatment)
  {
      GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
@@ -312,15 +311,17 @@ void pme_gpu_launch_gather(const gmx_pme_t                 *pme,
      wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
      const unsigned int gridIndex  = 0;
      real              *fftgrid    = pme->fftgrid[gridIndex];
-    pme_gpu_gather(pme->gpu, reinterpret_cast<float *>(forces), forceTreatment, reinterpret_cast<float *>(fftgrid));
+    pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid));
      wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
      wallcycle_stop(wcycle, ewcLAUNCH_GPU);
  }
  
-void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
-                          gmx_wallcycle_t  wcycle,
-                          matrix           vir_q,
-                          real            *energy_q)
+void
+pme_gpu_wait_for_gpu(const gmx_pme_t                *pme,
+                     gmx_wallcycle_t                 wcycle,
+                     gmx::ArrayRef<const gmx::RVec> *forces,
+                     matrix                          virial,
+                     real                           *energy)
  {
      GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
  
@@ -330,23 +331,17 @@ void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
      pme_gpu_finish_computation(pme->gpu);
      wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
  
+    *forces = pme_gpu_get_forces(pme->gpu);
+
      if (haveComputedEnergyAndVirial)
      {
-        if (pme->doCoulomb)
+        if (pme_gpu_performs_solve(pme->gpu))
          {
-            if (pme_gpu_performs_solve(pme->gpu))
-            {
-                pme_gpu_get_energy_virial(pme->gpu, energy_q, vir_q);
-            }
-            else
-            {
-                get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy_q, vir_q);
-            }
+            pme_gpu_get_energy_virial(pme->gpu, energy, virial);
          }
          else
          {
-            *energy_q = 0;
+            get_pme_ener_vir_q(pme->solve_work, pme->nthread, energy, virial);
          }
      }
-    /* No additional haveComputedForces code since forces are copied to the output host buffer with no transformation. */
  }
diff --git a/src/gromacs/ewald/pme-only.cpp b/src/gromacs/ewald/pme-only.cpp

index 88051eef0d321a8e6fbbe8ee76e9e62d4ef6d2d2..9c81b47cb27d96ad35336b15ebd408675259cffd 100644 (file)
--- a/src/gromacs/ewald/pme-only.cpp
+++ b/src/gromacs/ewald/pme-only.cpp
@@ -461,6 +461,7 @@ static int gmx_pme_recv_coeffs_coords(gmx_pme_pp        *pme_pp,
  
  /*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
  static void gmx_pme_send_force_vir_ener(gmx_pme_pp *pme_pp,
+                                        const rvec *f,
                                          matrix vir_q, real energy_q,
                                          matrix vir_lj, real energy_lj,
                                          real dvdlambda_q, real dvdlambda_lj,
@@ -478,7 +479,8 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp *pme_pp,
      {
          ind_start = ind_end;
          ind_end   = ind_start + receiver.numAtoms;
-        if (MPI_Isend(pme_pp->f[ind_start], (ind_end-ind_start)*sizeof(rvec), MPI_BYTE,
+        if (MPI_Isend(const_cast<void *>(static_cast<const void *>(f[ind_start])),
+                      (ind_end-ind_start)*sizeof(rvec), MPI_BYTE,
                        receiver.rankId, 0,
                        pme_pp->mpi_comm_mysim, &pme_pp->req[messages++]) != 0)
          {
@@ -512,6 +514,7 @@ static void gmx_pme_send_force_vir_ener(gmx_pme_pp *pme_pp,
  #else
      gmx_call("MPI not enabled");
      GMX_UNUSED_VALUE(pme_pp);
+    GMX_UNUSED_VALUE(f);
      GMX_UNUSED_VALUE(vir_q);
      GMX_UNUSED_VALUE(energy_q);
      GMX_UNUSED_VALUE(vir_lj);
@@ -616,6 +619,7 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
          // from mdatoms for the other call to gmx_pme_do), so we have
          // fewer lines of code and less parameter passing.
          const int pmeFlags = GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0);
+        gmx::ArrayRef<const gmx::RVec> forces;
          if (runMode != PmeRunMode::CPU)
          {
              const bool boxChanged = true;
@@ -624,8 +628,8 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
              pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags);
              pme_gpu_launch_spread(pme, as_rvec_array(pme_pp->x.data()), wcycle);
              pme_gpu_launch_complex_transforms(pme, wcycle);
-            pme_gpu_launch_gather(pme, wcycle, as_rvec_array(pme_pp->f.data()), PmeForceOutputHandling::Set);
-            pme_gpu_wait_for_gpu(pme, wcycle, vir_q, &energy_q);
+            pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
+            pme_gpu_wait_for_gpu(pme, wcycle, &forces, vir_q, &energy_q);
          }
          else
          {
@@ -637,11 +641,12 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
                         vir_q, vir_lj,
                         &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
                         pmeFlags);
+            forces = pme_pp->f;
          }
  
          cycles = wallcycle_stop(wcycle, ewcPMEMESH);
  
-        gmx_pme_send_force_vir_ener(pme_pp.get(),
+        gmx_pme_send_force_vir_ener(pme_pp.get(), as_rvec_array(forces.data()),
                                      vir_q, energy_q, vir_lj, energy_lj,
                                      dvdlambda_q, dvdlambda_lj, cycles);
  
diff --git a/src/gromacs/ewald/pme.cu b/src/gromacs/ewald/pme.cu

index a948730cde46e377cebe3943b7c8452b68eb48c3..c7e3bc71a41a98647e456a9d67b784a217555566 100644 (file)
--- a/src/gromacs/ewald/pme.cu
+++ b/src/gromacs/ewald/pme.cu
@@ -137,12 +137,14 @@ void pme_gpu_free_bspline_values(const PmeGpu *pmeGPU)
                       &pmeGPU->archSpecific->splineValuesSizeAlloc);
  }
  
-void pme_gpu_realloc_forces(const PmeGpu *pmeGPU)
+void pme_gpu_realloc_forces(PmeGpu *pmeGPU)
  {
      const size_t newForcesSize = pmeGPU->nAtomsAlloc * DIM;
      GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
      cu_realloc_buffered((void **)&pmeGPU->kernelParams->atoms.d_forces, nullptr, sizeof(float),
                          &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc, newForcesSize, pmeGPU->archSpecific->pmeStream, true);
+    pmeGPU->staging.h_forces.reserve(pmeGPU->nAtomsAlloc);
+    pmeGPU->staging.h_forces.resize(pmeGPU->kernelParams->atoms.nAtoms);
  }
  
  void pme_gpu_free_forces(const PmeGpu *pmeGPU)
@@ -150,20 +152,18 @@ void pme_gpu_free_forces(const PmeGpu *pmeGPU)
      cu_free_buffered(pmeGPU->kernelParams->atoms.d_forces, &pmeGPU->archSpecific->forcesSize, &pmeGPU->archSpecific->forcesSizeAlloc);
  }
  
-void pme_gpu_copy_input_forces(const PmeGpu *pmeGPU, const float *h_forces)
+void pme_gpu_copy_input_forces(PmeGpu *pmeGPU)
  {
-    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
      const size_t forcesSize = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
      GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, const_cast<float *>(h_forces), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
+    cu_copy_H2D(pmeGPU->kernelParams->atoms.d_forces, pmeGPU->staging.h_forces.data(), forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
  }
  
-void pme_gpu_copy_output_forces(const PmeGpu *pmeGPU, float *h_forces)
+void pme_gpu_copy_output_forces(PmeGpu *pmeGPU)
  {
-    GMX_ASSERT(h_forces, "nullptr host forces pointer in PME GPU");
      const size_t forcesSize   = DIM * pmeGPU->kernelParams->atoms.nAtoms * sizeof(float);
      GMX_ASSERT(forcesSize > 0, "Bad number of atoms in PME GPU");
-    cu_copy_D2H(h_forces, pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
+    cu_copy_D2H(pmeGPU->staging.h_forces.data(), pmeGPU->kernelParams->atoms.d_forces, forcesSize, pmeGPU->settings.transferKind, pmeGPU->archSpecific->pmeStream);
  }
  
  void pme_gpu_realloc_coordinates(const PmeGpu *pmeGPU)
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h

index e82ebb04bfac403729df095004488e2ab5b31329..571f21d4e735ee9a76a9b7b57ee95a51e8b4c7b7 100644 (file)
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -53,6 +53,7 @@
  #include "gromacs/math/vectypes.h"
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/real.h"
  
@@ -320,14 +321,12 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t       *pme,
   *
   * \param[in]  pme               The PME data structure.
   * \param[in]  wcycle            The wallclock counter.
- * \param[in,out] forces         The array of local atoms' resulting forces.
- * \param[in]  forceTreatment    Tells how data in h_forces should be treated. The gathering kernel either stores
+ * \param[in]  forceTreatment    Tells how data should be treated. The gathering kernel either stores
   *                               the output reciprocal forces into the host array, or copies its contents to the GPU first
   *                               and accumulates. The reduction is non-atomic.
   */
  void pme_gpu_launch_gather(const gmx_pme_t        *pme,
                             gmx_wallcycle_t         wcycle,
-                           rvec                   *forces,
                             PmeForceOutputHandling  forceTreatment);
  
  /*! \brief
@@ -335,13 +334,15 @@ void pme_gpu_launch_gather(const gmx_pme_t        *pme,
   * (if they were to be computed).
   *
   * \param[in]  pme            The PME data structure.
- * \param[in]  wcycle         The wallclock counter.
- * \param[out] vir_q          The output virial matrix.
- * \param[out] energy_q       The output energy.
+ * \param[out] wcycle         The wallclock counter.
+ * \param[out] forces         The output forces.
+ * \param[out] virial         The output virial matrix.
+ * \param[out] energy         The output energy.
   */
-void pme_gpu_wait_for_gpu(const gmx_pme_t *pme,
-                          gmx_wallcycle_t  wcycle,
-                          matrix           vir_q,
-                          real            *energy_q);
+void pme_gpu_wait_for_gpu(const gmx_pme_t                *pme,
+                          gmx_wallcycle_t                 wcycle,
+                          gmx::ArrayRef<const gmx::RVec> *forces,
+                          matrix                          virial,
+                          real                           *energy);
  
  #endif
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp

index b16dbf4b0f4609d10ca84d0b2f2caa633cc9f7ab..f7c7b0f141f48f9b660da98ac8e99321085d5d70 100644 (file)
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -388,8 +388,24 @@ void pmePerformGather(gmx_pme_t *pme, CodePath mode,
              break;
  
          case CodePath::CUDA:
-            pme_gpu_gather(pme->gpu, reinterpret_cast<float *>(forces.begin()), inputTreatment, reinterpret_cast<float *>(fftgrid));
-            break;
+        {
+            // Variable initialization needs a non-switch scope
+            auto stagingForces = pme_gpu_get_forces(pme->gpu);
+            GMX_ASSERT(forces.size() == stagingForces.size(), "Size of force buffers did not match");
+            if (forceReductionWithInput)
+            {
+                for (size_t i = 0; i != forces.size(); ++i)
+                {
+                    stagingForces[i] = forces[i];
+                }
+            }
+            pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid));
+            for (size_t i = 0; i != forces.size(); ++i)
+            {
+                forces[i] = stagingForces[i];
+            }
+        }
+        break;
  
          default:
              GMX_THROW(InternalError("Test not implemented for this mode"));
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index f4889db728dee3da4e54c9c44097d44173a3f0f6..05c59c2f2937752d83b48a44dfdab5bb5191519e 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -1514,11 +1514,6 @@ void forcerec_set_ranges(t_forcerec *fr,
      {
          fr->forceBufferForDirectVirialContributions->resize(natoms_f_novirsum);
      }
-
-    if (fr->ic->cutoff_scheme == ecutsVERLET)
-    {
-        fr->forceBufferIntermediate->resize(ncg_home);
-    }
  }
  
  static real cutoff_inf(real cutoff)
@@ -2793,8 +2788,6 @@ void init_forcerec(FILE                *fp,
          fr->forceBufferForDirectVirialContributions = new std::vector<gmx::RVec>;
      }
  
-    fr->forceBufferIntermediate = new std::vector<gmx::RVec>; //TODO add proper conditionals
-
      if (fr->cutoff_scheme == ecutsGROUP &&
          ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
      {
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 2f11ed7c76d28cf26ca061dbcc0438a4a778d189..7f625a3b0dd3d51a4ad608ec9b2a8681c113e4a0 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -829,15 +829,13 @@ static inline void launchPmeGpuSpread(gmx_pme_t      *pmedata,
   * This function only implements setting the output forces (no accumulation).
   *
   * \param[in]  pmedata        The PME structure
- * \param[out] pmeGpuForces   The array of where the output forces are copied
   * \param[in]  wcycle         The wallcycle structure
   */
  static void launchPmeGpuFftAndGather(gmx_pme_t        *pmedata,
-                                     ArrayRef<RVec>    pmeGpuForces,
                                       gmx_wallcycle_t   wcycle)
  {
      pme_gpu_launch_complex_transforms(pmedata, wcycle);
-    pme_gpu_launch_gather(pmedata, wcycle, as_rvec_array(pmeGpuForces.data()), PmeForceOutputHandling::Set);
+    pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
  }
  
  static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
@@ -879,8 +877,6 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
      // TODO slim this conditional down - inputrec and duty checks should mean the same in proper code!
      const bool useGpuPme  = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
          ((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Hybrid));
-    // a comment for uncrustify
-    const ArrayRef<RVec> pmeGpuForces = *fr->forceBufferIntermediate;
  
      /* At a search step we need to start the first balancing region
       * somewhere early inside the step after communication during domain
@@ -1075,7 +1071,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          // X copy/transform to allow overlap.
          // Note that this is advantageous for the case where NB and PME
          // tasks run on the same device, but may not be ideal otherwise.
-        launchPmeGpuFftAndGather(fr->pmedata, pmeGpuForces, wcycle);
+        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
      }
  
      if (bUseGPU)
@@ -1100,7 +1096,7 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          // PME GPU - intermediate CPU work in mixed mode
          // TODO - move this below till after do_force_lowlevel() / special forces?
          //        (to allow overlap of spread/drid D2H with some CPU work)
-        launchPmeGpuFftAndGather(fr->pmedata, pmeGpuForces, wcycle);
+        launchPmeGpuFftAndGather(fr->pmedata, wcycle);
      }
  
      /* Communicate coordinates and sum dipole if necessary +
@@ -1417,13 +1413,11 @@ static void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
  
      if (useGpuPme)
      {
+        gmx::ArrayRef<const gmx::RVec> pmeGpuForces;
          matrix vir_Q;
          real   Vlr_q;
-        pme_gpu_wait_for_gpu(fr->pmedata, wcycle, vir_Q, &Vlr_q);
-
-        pme_gpu_reduce_outputs(wcycle, &forceWithVirial,
-                               pmeGpuForces,
-                               enerd, vir_Q, Vlr_q);
+        pme_gpu_wait_for_gpu(fr->pmedata, wcycle, &pmeGpuForces, vir_Q, &Vlr_q);
+        pme_gpu_reduce_outputs(wcycle, &forceWithVirial, pmeGpuForces, enerd, vir_Q, Vlr_q);
      }
  
      if (bUseOrEmulGPU)
diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h

index cea244b21c65cb379d1053ab544514f737556225..ce3b8b341f39fc2eb3c401855e0ae9bb719d6c88 100644 (file)
--- a/src/gromacs/mdtypes/forcerec.h
+++ b/src/gromacs/mdtypes/forcerec.h
@@ -292,14 +292,9 @@ struct t_forcerec {
      gmx_bool          haveDirectVirialContributions;
  #ifdef __cplusplus
      /* TODO: Replace the pointer by an object once we got rid of C */
-    std::vector<gmx::RVec> *forceBufferForDirectVirialContributions;
-    /* This buffer is currently only used for storing the PME GPU output until reduction.
-     * TODO: Pagelock/pin it
-     * TODO: Replace the pointer by an object once we got rid of C */
-    std::vector<gmx::RVec>  *forceBufferIntermediate;
+    std::vector<gmx::RVec>  *forceBufferForDirectVirialContributions;
  #else
      void                    *forceBufferForDirectVirialContributions_dummy;
-    void                    *forceBufferIntermediate_dummy;
  #endif
  
      /* Data for PPPM/PME/Ewald */
author	Mark Abraham <mark.j.abraham@gmail.com>
	Mon, 6 Nov 2017 08:28:01 +0000 (09:28 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Fri, 17 Nov 2017 16:42:29 +0000 (17:42 +0100)
src/gromacs/ewald/pme-gather.cu		patch \| blob \| history
src/gromacs/ewald/pme-gpu-internal.cpp		patch \| blob \| history
src/gromacs/ewald/pme-gpu-internal.h		patch \| blob \| history
src/gromacs/ewald/pme-gpu-types.h		patch \| blob \| history
src/gromacs/ewald/pme-gpu.cpp		patch \| blob \| history
src/gromacs/ewald/pme-only.cpp		patch \| blob \| history
src/gromacs/ewald/pme.cu		patch \| blob \| history
src/gromacs/ewald/pme.h		patch \| blob \| history
src/gromacs/ewald/tests/pmetestcommon.cpp		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdtypes/forcerec.h		patch \| blob \| history