StatePropagatorDataGpu object to manage GPU forces, positions and velocities buffers

author Artem Zhmurov <zhmurov@gmail.com>

Tue, 3 Sep 2019 12:23:40 +0000 (14:23 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Tue, 1 Oct 2019 16:22:17 +0000 (18:22 +0200)
author Artem Zhmurov <zhmurov@gmail.com>
Tue, 3 Sep 2019 12:23:40 +0000 (14:23 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Tue, 1 Oct 2019 16:22:17 +0000 (18:22 +0200)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index 2807a9759206d16cc94987a2820ad35c1bf25484..f3365e11a37900fbfe3cf8e79b62d2f310461bb4 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -235,6 +235,11 @@ gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
      return &dd->comm->zones;
  }
  
+int dd_numAtomsZones(const gmx_domdec_t &dd)
+{
+    return dd.comm->atomRanges.end(DDAtomRanges::Type::Zones);
+}
+
  int dd_numHomeAtoms(const gmx_domdec_t &dd)
  {
      return dd.comm->atomRanges.numHomeAtoms();
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h

index 9405a030ecff6a096d93475e5df5a7a6f9969d54..97b38a4300a396522115ffc07b233342cd47aec5 100644 (file)
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -110,6 +110,9 @@ void dd_store_state(struct gmx_domdec_t *dd, t_state *state);
  /*! \brief Returns a pointer to the gmx_domdec_zones_t struct */
  struct gmx_domdec_zones_t *domdec_zones(struct gmx_domdec_t *dd);
  
+/*! \brief Returns the range for atoms in zones*/
+int dd_numAtomsZones(const gmx_domdec_t &dd);
+
  /*! \brief Returns the number of home atoms */
  int dd_numHomeAtoms(const gmx_domdec_t &dd);
  
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h

index 4187e45cc24ec6a6a43529fd0a75315b747b8695..f991e0512327e77f64d8bd830d1db2dbdae7d51b 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -42,6 +42,7 @@
  #ifndef GMX_DOMDEC_GPUHALOEXCHANGE_H
  #define GMX_DOMDEC_GPUHALOEXCHANGE_H
  
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/math/vectypes.h"
  #include "gromacs/utility/basedefinitions.h"
  #include "gromacs/utility/classhelpers.h"
@@ -95,8 +96,9 @@ class GpuHaloExchange
           * \param [in] d_coordinateBuffer   pointer to coordinates buffer in GPU memory
           * \param [in] d_forcesBuffer   pointer to coordinates buffer in GPU memory
           */
-        void reinitHalo(rvec        *d_coordinateBuffer,
-                        rvec        *d_forcesBuffer);
+        void reinitHalo(DeviceBuffer<float>  d_coordinateBuffer,
+                        DeviceBuffer<float>  d_forcesBuffer);
+
  
          /*! \brief GPU halo exchange of coordinates buffer.
           *
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp

index 741c79519c0e6b88b5d5146c2f0915dc39d35a40..c4c191b314fb4f6aad8dc11ffa2745ca17759357 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -73,8 +73,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
  GpuHaloExchange::~GpuHaloExchange() = default;
  
  /*!\brief init halo exhange stub. */
-void GpuHaloExchange::reinitHalo(rvec * /* d_coordinatesBuffer */,
-                                 rvec * /* d_forcesBuffer */)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<float> /* d_coordinatesBuffer */,
+                                 DeviceBuffer<float> /* d_forcesBuffer */)
  {
      GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
  }
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu

index dcb3bcd839b83c8e3b6102d9dd51613c5c266e52..c195f96cb577aff3e46c7bc57f4a937b0ff43bce 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -438,8 +438,8 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
  
  GpuHaloExchange::~GpuHaloExchange() = default;
  
-void GpuHaloExchange::reinitHalo(rvec        *d_coordinatesBuffer,
-                                 rvec        *d_forcesBuffer)
+void GpuHaloExchange::reinitHalo(DeviceBuffer<float>  d_coordinatesBuffer,
+                                 DeviceBuffer<float>  d_forcesBuffer)
  {
      impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
  }
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h

index 6997d4e0eb137c8c0b13533adbf3671271c99cb8..c6ec4e6fca8d1c0d8110b4a0bf9bcbfcdf4ffaba 100644 (file)
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -365,24 +365,13 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t      *GPU_FUNC_ARG
                                                      bool            GPU_FUNC_ARGUMENT(useGpuForceReduction)) GPU_FUNC_TERM;
  
  /*! \brief
- * Launches H2D input transfers for PME on GPU.
- *
- * \param[in] pme               The PME data structure.
- * \param[in] coordinatesHost   The array of local atoms' coordinates.
- * \param[in] wcycle            The wallclock counter.
- */
-GPU_FUNC_QUALIFIER void pme_gpu_copy_coordinates_to_gpu(gmx_pme_t            *GPU_FUNC_ARGUMENT(pme),
-                                                        const rvec           *GPU_FUNC_ARGUMENT(coordinatesHost),
-                                                        gmx_wallcycle        *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
-
-/*! \brief
- * Launches first stage of PME on GPU - spreading kernel, and D2H grid transfer if needed.
+ * Launches first stage of PME on GPU - spreading kernel.
   *
   * \param[in] pme                The PME data structure.
   * \param[in] wcycle             The wallclock counter.
   */
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t           *GPU_FUNC_ARGUMENT(pme),
-                                              gmx_wallcycle       *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t      *GPU_FUNC_ARGUMENT(pme),
+                                              gmx_wallcycle  *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
  
  /*! \brief
   * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
@@ -476,6 +465,13 @@ GPU_FUNC_QUALIFIER void pme_gpu_reinit_computation(const gmx_pme_t *GPU_FUNC_ARG
   */
  GPU_FUNC_QUALIFIER DeviceBuffer<float> pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
  
+/*! \brief Set pointer to device copy of coordinate data.
+ * \param[in] pme            The PME data structure.
+ * \param[in] d_x            The pointer to the positions buffer to be set
+ */
+GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t     *GPU_FUNC_ARGUMENT(pme),
+                                             DeviceBuffer<float>  GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+
  /*! \brief Get pointer to device copy of force data.
   * \param[in] pme            The PME data structure.
   * \returns                  Pointer to force data
@@ -488,6 +484,12 @@ GPU_FUNC_QUALIFIER void *pme_gpu_get_device_f(const gmx_pme_t *GPU_FUNC_ARGUMENT
   */
  GPU_FUNC_QUALIFIER void *pme_gpu_get_device_stream(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
  
+/*! \brief Returns the pointer to the GPU context.
+ *  \param[in] pme            The PME data structure.
+ *  \returns                  Pointer to GPU context object.
+ */
+GPU_FUNC_QUALIFIER void *pme_gpu_get_device_context(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
  /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
   * \param[in] pme            The PME data structure.
   * \returns                  Pointer to sychronizer
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp

index ddbc2f9485be5afab91754ade5734f4e99c2aac5..4685913dbde8e7922d0bff0e983f875b96fb4b94 100644 (file)
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -174,23 +174,6 @@ void pme_gpu_prepare_computation(gmx_pme_t            *pme,
      }
  }
  
-void pme_gpu_copy_coordinates_to_gpu(gmx_pme_t            *pme,
-                                     const rvec           *coordinatesHost,
-                                     gmx_wallcycle        *wcycle)
-{
-    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
-
-    PmeGpu *pmeGpu = pme->gpu;
-
-    // The only spot of PME GPU where LAUNCH_GPU counter increases call-count
-    wallcycle_start(wcycle, ewcLAUNCH_GPU);
-    // The only spot of PME GPU where ewcsLAUNCH_GPU_PME subcounter increases call-count
-    wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_PME);
-    pme_gpu_copy_input_coordinates(pmeGpu, coordinatesHost);
-    wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
-    wallcycle_stop(wcycle, ewcLAUNCH_GPU);
-}
-
  void pme_gpu_launch_spread(gmx_pme_t            *pme,
                             gmx_wallcycle        *wcycle)
  {
@@ -444,6 +427,15 @@ void *pme_gpu_get_device_f(const gmx_pme_t *pme)
      return pme_gpu_get_kernelparam_forces(pme->gpu);
  }
  
+void pme_gpu_set_device_x(const gmx_pme_t     *pme,
+                          DeviceBuffer<float>  d_x)
+{
+    GMX_ASSERT(pme != nullptr, "Null pointer is passed as a PME to the set coordinates function.");
+    GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
+
+    pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
+}
+
  void *pme_gpu_get_device_stream(const gmx_pme_t *pme)
  {
      if (!pme || !pme_gpu_active(pme))
@@ -453,6 +445,15 @@ void *pme_gpu_get_device_stream(const gmx_pme_t *pme)
      return pme_gpu_get_stream(pme->gpu);
  }
  
+void *pme_gpu_get_device_context(const gmx_pme_t *pme)
+{
+    if (!pme || !pme_gpu_active(pme))
+    {
+        return nullptr;
+    }
+    return pme_gpu_get_context(pme->gpu);
+}
+
  GpuEventSynchronizer * pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *pme)
  {
      if (!pme || !pme_gpu_active(pme))
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp

index c11580aa88de2466d29adec57411afefbb03dbc4..47ec41bb3c18c2f8954b04b4b7a04a10e6c1a27f 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -233,23 +233,6 @@ void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu)
      }
  }
  
-void pme_gpu_copy_input_coordinates(const PmeGpu *pmeGpu, const rvec *h_coordinates)
-{
-    GMX_ASSERT(h_coordinates, "Bad host-side coordinate buffer in PME GPU");
-#if GMX_DOUBLE
-    GMX_RELEASE_ASSERT(false, "Only single precision is supported");
-    GMX_UNUSED_VALUE(h_coordinates);
-#else
-    const float *h_coordinatesFloat = reinterpret_cast<const float *>(h_coordinates);
-    copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates, h_coordinatesFloat,
-                       0, pmeGpu->kernelParams->atoms.nAtoms * DIM,
-                       pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
-    // FIXME: sync required since the copied data will be used by PP stream when using single GPU for both
-    //        Remove after adding the required event-based sync between the above H2D and the transform kernel
-    pme_gpu_synchronize(pmeGpu);
-#endif
-}
-
  void pme_gpu_free_coordinates(const PmeGpu *pmeGpu)
  {
      freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coordinates);
@@ -967,7 +950,6 @@ void pme_gpu_destroy(PmeGpu *pmeGpu)
      pme_gpu_free_energy_virial(pmeGpu);
      pme_gpu_free_bspline_values(pmeGpu);
      pme_gpu_free_forces(pmeGpu);
-    pme_gpu_free_coordinates(pmeGpu);
      pme_gpu_free_coefficients(pmeGpu);
      pme_gpu_free_spline_data(pmeGpu);
      pme_gpu_free_grid_indices(pmeGpu);
@@ -1002,7 +984,6 @@ void pme_gpu_reinit_atoms(PmeGpu *pmeGpu, const int nAtoms, const real *charges)
  
      if (haveToRealloc)
      {
-        pme_gpu_realloc_coordinates(pmeGpu);
          pme_gpu_realloc_forces(pmeGpu);
          pme_gpu_realloc_spline_data(pmeGpu);
          pme_gpu_realloc_grid_indices(pmeGpu);
@@ -1317,6 +1298,44 @@ void * pme_gpu_get_kernelparam_forces(const PmeGpu *pmeGpu)
      }
  }
  
+/*! \brief Check the validity of the device buffer.
+ *
+ * Checks if the buffer is not nullptr and, when possible, if it is big enough.
+ *
+ * \todo Split and move this function to gpu_utils.
+ *
+ * \param[in] buffer        Device buffer to be checked.
+ * \param[in] requiredSize  Number of elements that the buffer will have to accommodate.
+ *
+ * \returns If the device buffer can be set.
+ */
+template<typename T>
+static bool checkDeviceBuffer(gmx_unused DeviceBuffer<T> buffer, gmx_unused int requiredSize)
+{
+#if GMX_GPU == GMX_GPU_CUDA
+    GMX_ASSERT(buffer != nullptr, "The device pointer is nullptr");
+    return buffer != nullptr;
+#elif GMX_GPU == GMX_GPU_OPENCL
+    size_t size;
+    int    retval = clGetMemObjectInfo(buffer, CL_MEM_SIZE, sizeof(size), &size, NULL);
+    GMX_ASSERT(retval == CL_SUCCESS, gmx::formatString("clGetMemObjectInfo failed with error code #%d", retval).c_str());
+    GMX_ASSERT(static_cast<int>(size) >= requiredSize, "Number of atoms in device buffer is smaller then required size.");
+    return retval == CL_SUCCESS && static_cast<int>(size) >= requiredSize;
+#elif GMX_GPU == GMX_GPU_NONE
+    GMX_ASSERT(false, "Setter for device-side coordinates was called in non-GPU build.");
+    return false;
+#endif
+}
+
+void pme_gpu_set_kernelparam_coordinates(const PmeGpu *pmeGpu, DeviceBuffer<float> d_x)
+{
+    GMX_ASSERT(pmeGpu && pmeGpu->kernelParams, "PME GPU device buffer can not be set in non-GPU builds or before the GPU PME was initialized.");
+
+    GMX_ASSERT(checkDeviceBuffer(d_x, pmeGpu->kernelParams->atoms.nAtoms), "The device-side buffer can not be set.");
+
+    pmeGpu->kernelParams->atoms.d_coordinates = d_x;
+}
+
  void * pme_gpu_get_stream(const PmeGpu *pmeGpu)
  {
      if (pmeGpu)
@@ -1329,6 +1348,18 @@ void * pme_gpu_get_stream(const PmeGpu *pmeGpu)
      }
  }
  
+void * pme_gpu_get_context(const PmeGpu *pmeGpu)
+{
+    if (pmeGpu)
+    {
+        return static_cast<void *>(&pmeGpu->archSpecific->context);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
  GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *pmeGpu)
  {
      if (pmeGpu && pmeGpu->kernelParams)
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h

index d6aafdbc85c8b7979aeba7dbd9e5fc659fc8e267..bdd92bb83be24a3bcd10bc147c8a638bf9cd5640 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -185,17 +185,6 @@ bool pme_gpu_stream_query(const PmeGpu *pmeGpu);
   */
  void pme_gpu_realloc_coordinates(const PmeGpu *pmeGpu);
  
-/*! \libinternal \brief
- * Copies the input coordinates from the CPU buffer onto the GPU.
- *
- * \param[in] pmeGpu            The PME GPU structure.
- * \param[in] h_coordinates     Input coordinates (XYZ rvec array).
- *
- * Needs to be called for every PME computation. The coordinates are then used in the spline calculation.
- */
-GPU_FUNC_QUALIFIER void pme_gpu_copy_input_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
-                                                       const rvec   *GPU_FUNC_ARGUMENT(h_coordinates)) GPU_FUNC_TERM;
-
  /*! \libinternal \brief
   * Frees the coordinates on the GPU.
   *
@@ -448,6 +437,13 @@ GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu                *GPU_FUNC_ARGUMENT(
   */
  GPU_FUNC_QUALIFIER DeviceBuffer<float> pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
  
+/*! \brief Sets the device pointer to coordinate data
+ * \param[in] pmeGpu         The PME GPU structure.
+ * \param[in] d_x            Pointer to coordinate data
+ */
+GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu        *GPU_FUNC_ARGUMENT(pmeGpu),
+                                                            DeviceBuffer<float>  GPU_FUNC_ARGUMENT(d_x)) GPU_FUNC_TERM;
+
  /*! \brief Return pointer to device copy of force data.
   * \param[in] pmeGpu         The PME GPU structure.
   * \returns                  Pointer to force data
@@ -460,6 +456,12 @@ GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_forces(const PmeGpu *GPU_FUNC_
   */
  GPU_FUNC_QUALIFIER void * pme_gpu_get_stream(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
  
+/*! \brief Return pointer to GPU context (for OpenCL builds).
+ * \param[in] pmeGpu         The PME GPU structure.
+ * \returns                  Pointer to context object.
+ */
+GPU_FUNC_QUALIFIER void * pme_gpu_get_context(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
  /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
   * \param[in] pmeGpu         The PME GPU structure.
   * \returns                  Pointer to sync object
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp

index 1a872bb383eb3a3ca16cdbb57482447343ae57cb..eb81ebb79f90b9d458ed94da5c32a0c0178b9890 100644 (file)
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -84,6 +84,7 @@
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/forceoutput.h"
  #include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
  #include "gromacs/timing/cyclecounter.h"
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/utility/fatalerror.h"
@@ -543,15 +544,21 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
      std::vector<gmx_pme_t *> pmedata;
      pmedata.push_back(pme);
  
-    auto       pme_pp       = gmx_pme_pp_init(cr);
+    auto        pme_pp       = gmx_pme_pp_init(cr);
      //TODO the variable below should be queried from the task assignment info
-    const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
+    const bool  useGpuForPme   = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
+    const void *commandStream  = useGpuForPme ? pme_gpu_get_device_context(pme) : nullptr;
+    const void *gpuContext     = useGpuForPme ? pme_gpu_get_device_stream(pme) : nullptr;
+    const int   paddingSize    = pme_gpu_get_padding_size(pme);
      if (useGpuForPme)
      {
          changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
          changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
      }
  
+    // Unconditionally initialize the StatePropagatorDataGpu object to get more verbose message if it is used from CPU builds
+    auto stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(commandStream, gpuContext, GpuApiCallBehavior::Sync, paddingSize);
+
      clear_nrnb(mynrnb);
  
      count = 0;
@@ -585,6 +592,11 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
              if (atomSetChanged)
              {
                  gmx_pme_reinit_atoms(pme, natoms, pme_pp->chargeA.data());
+                if (useGpuForPme)
+                {
+                    stateGpu->reinit(natoms, natoms);
+                    pme_gpu_set_device_x(pme, stateGpu->getCoordinates());
+                }
              }
  
              if (ret == pmerecvqxRESETCOUNTERS)
@@ -625,7 +637,8 @@ int gmx_pmeonly(struct gmx_pme_t *pme,
              //TODO this should be set properly by gmx_pme_recv_coeffs_coords,
              // or maybe use inputrecDynamicBox(ir), at the very least - change this when this codepath is tested!
              pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags, useGpuPmeForceReduction);
-            pme_gpu_copy_coordinates_to_gpu(pme, as_rvec_array(pme_pp->x.data()), wcycle);
+            stateGpu->copyCoordinatesToGpu(gmx::ArrayRef<gmx::RVec>(pme_pp->x), gmx::StatePropagatorDataGpu::AtomLocality::All);
+
              pme_gpu_launch_spread(pme, wcycle);
              pme_gpu_launch_complex_transforms(pme, wcycle);
              pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp

index db725106368f822647efe924a48a15dd936df965..0d4e94db6c93d0f514ad6861435eb10c6205c760 100644 (file)
--- a/src/gromacs/ewald/tests/pmegathertest.cpp
+++ b/src/gromacs/ewald/tests/pmegathertest.cpp
@@ -388,12 +388,13 @@ class PmeGatherTest : public ::testing::TestWithParam<GatherInputParameters>
              TestReferenceData refData;
              for (const auto &context : getPmeTestEnv()->getHardwareContexts())
              {
+                std::shared_ptr<StatePropagatorDataGpu> stateGpu;
                  CodePath   codePath       = context->getCodePath();
                  const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
                  if (!supportedInput)
                  {
                      /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box), NotImplementedError);
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, inputAtomData.coordinates, inputAtomData.charges, box, stateGpu), NotImplementedError);
                      continue;
                  }
  
@@ -408,7 +409,7 @@ class PmeGatherTest : public ::testing::TestWithParam<GatherInputParameters>
                                            ));
  
                  PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
-                                                      context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box);
+                                                      context->getPmeGpuProgram(), inputAtomData.coordinates, inputAtomData.charges, box, stateGpu);
  
                  /* Setting some more inputs */
                  pmeSetRealGrid(pmeSafe.get(), codePath, nonZeroGridValues);
diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp

index eef1b9fe2d460813e1d249d777a95749149333f2..a1c6eb4c1dc6c7147769a50e3d07f00c2d1e826e 100644 (file)
--- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
+++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
@@ -122,12 +122,13 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
  
              for (const auto &context : getPmeTestEnv()->getHardwareContexts())
              {
+                std::shared_ptr<StatePropagatorDataGpu> stateGpu;
                  CodePath   codePath       = context->getCodePath();
                  const bool supportedInput = pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
                  if (!supportedInput)
                  {
                      /* Testing the failure for the unsupported input */
-                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box), NotImplementedError);
+                    EXPECT_THROW(pmeInitAtoms(&inputRec, codePath, nullptr, nullptr, coordinates, charges, box, stateGpu), NotImplementedError);
                      continue;
                  }
  
@@ -146,7 +147,7 @@ class PmeSplineAndSpreadTest : public ::testing::TestWithParam<SplineAndSpreadIn
                      /* Running the test */
  
                      PmeSafePointer pmeSafe = pmeInitAtoms(&inputRec, codePath, context->getDeviceInfo(),
-                                                          context->getPmeGpuProgram(), coordinates, charges, box);
+                                                          context->getPmeGpuProgram(), coordinates, charges, box, stateGpu);
  
                      const bool     computeSplines = (option.first == PmeSplineAndSpreadOptions::SplineOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
                      const bool     spreadCharges  = (option.first == PmeSplineAndSpreadOptions::SpreadOnly) || (option.first == PmeSplineAndSpreadOptions::SplineAndSpreadUnified);
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp

index d5ce8887984421f78defbeebbebeb1d15deb1fa7..61b28e587ab4cfae47f4e1f7cf19a214ae970cf9 100644 (file)
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -169,13 +169,14 @@ PmeSafePointer pmeInitEmpty(const t_inputrec         *inputRec,
  }
  
  //! PME initialization with atom data
-PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
-                            CodePath                  mode,
-                            const gmx_device_info_t  *gpuInfo,
-                            PmeGpuProgramHandle       pmeGpuProgram,
-                            const CoordinatesVector  &coordinates,
-                            const ChargesVector      &charges,
-                            const Matrix3x3          &box
+PmeSafePointer pmeInitAtoms(const t_inputrec                        *inputRec,
+                            CodePath                                 mode,
+                            const gmx_device_info_t                 *gpuInfo,
+                            PmeGpuProgramHandle                      pmeGpuProgram,
+                            const CoordinatesVector                 &coordinates,
+                            const ChargesVector                     &charges,
+                            const Matrix3x3                         &box,
+                            std::shared_ptr<StatePropagatorDataGpu>  stateGpu
                              )
  {
      const index     atomCount = coordinates.size();
@@ -199,7 +200,16 @@ PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
              // We need to set atc->n for passing the size in the tests
              atc->setNumAtoms(atomCount);
              gmx_pme_reinit_atoms(pmeSafe.get(), atomCount, charges.data());
-            pme_gpu_copy_input_coordinates(pmeSafe->gpu, as_rvec_array(coordinates.data()));
+
+            // TODO: Pin the host buffer and use async memory copies
+            stateGpu = std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(pmeSafe.get()),
+                                                                pme_gpu_get_device_context(pmeSafe.get()),
+                                                                GpuApiCallBehavior::Sync,
+                                                                pme_gpu_get_padding_size(pmeSafe.get()));
+            stateGpu->reinit(atomCount, atomCount);
+            stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()), gmx::StatePropagatorDataGpu::AtomLocality::All);
+            pme_gpu_set_kernelparam_coordinates(pmeSafe->gpu, stateGpu->getCoordinates());
+
              break;
  
          default:
diff --git a/src/gromacs/ewald/tests/pmetestcommon.h b/src/gromacs/ewald/tests/pmetestcommon.h

index e9290dbae941db3875c74149a3bc314e9e87b199..b51551c29a6041c558c74435010682209d96ef77 100644 (file)
--- a/src/gromacs/ewald/tests/pmetestcommon.h
+++ b/src/gromacs/ewald/tests/pmetestcommon.h
@@ -51,6 +51,7 @@
  #include "gromacs/ewald/pme.h"
  #include "gromacs/ewald/pme_gpu_internal.h"
  #include "gromacs/math/gmxcomplex.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
  #include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/unique_cptr.h"
  
@@ -125,13 +126,14 @@ PmeSafePointer pmeInitEmpty(const t_inputrec *inputRec,
                              const Matrix3x3 &box = {{1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F}},
                              real ewaldCoeff_q = 0.0F, real ewaldCoeff_lj = 0.0F);
  //! PME initialization with atom data and system box
-PmeSafePointer pmeInitAtoms(const t_inputrec         *inputRec,
-                            CodePath                  mode,
-                            const gmx_device_info_t  *gpuInfo,
-                            PmeGpuProgramHandle       pmeGpuProgram,
-                            const CoordinatesVector  &coordinates,
-                            const ChargesVector      &charges,
-                            const Matrix3x3          &box
+PmeSafePointer pmeInitAtoms(const t_inputrec                        *inputRec,
+                            CodePath                                 mode,
+                            const gmx_device_info_t                 *gpuInfo,
+                            PmeGpuProgramHandle                      pmeGpuProgram,
+                            const CoordinatesVector                 &coordinates,
+                            const ChargesVector                     &charges,
+                            const Matrix3x3                         &box,
+                            std::shared_ptr<StatePropagatorDataGpu>  stateGpu
                              );
  //! PME spline computation and charge spreading
  void pmePerformSplineAndSpread(gmx_pme_t *pme, CodePath mode,
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 16c891532dc3865e87dd4ea43338cb0f8ffb9662..a77e5751cff80ab82b91a1f749564ae24a09433a 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -86,6 +86,7 @@
  #include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/mdtypes/simulation_workload.h"
  #include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
  #include "gromacs/nbnxm/atomdata.h"
  #include "gromacs/nbnxm/gpu_data_mgmt.h"
  #include "gromacs/nbnxm/nbnxm.h"
@@ -601,7 +602,6 @@ computeSpecialForces(FILE                          *fplog,
   *
   * \param[in]  pmedata              The PME structure
   * \param[in]  box                  The box matrix
- * \param[in]  x                    Coordinate array
   * \param[in]  stepWork             Step schedule flags
   * \param[in]  pmeFlags             PME flags
   * \param[in]  useGpuForceReduction True if GPU-based force reduction is active this step
@@ -609,14 +609,12 @@ computeSpecialForces(FILE                          *fplog,
   */
  static inline void launchPmeGpuSpread(gmx_pme_t          *pmedata,
                                        const matrix        box,
-                                      const rvec          x[],
                                        const StepWorkload &stepWork,
                                        int                 pmeFlags,
                                        bool                useGpuForceReduction,
                                        gmx_wallcycle_t     wcycle)
  {
      pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
-    pme_gpu_copy_coordinates_to_gpu(pmedata, x, wcycle);
      pme_gpu_launch_spread(pmedata, wcycle);
  }
  
@@ -889,12 +887,13 @@ void do_force(FILE                                     *fplog,
                int                                       legacyFlags,
                const DDBalanceRegionHandler             &ddBalanceRegionHandler)
  {
-    int                  i, j;
-    double               mu[2*DIM];
-    gmx_bool             bFillGrid, bCalcCGCM;
-    gmx_bool             bUseGPU, bUseOrEmulGPU;
-    nonbonded_verlet_t  *nbv = fr->nbv.get();
-    interaction_const_t *ic  = fr->ic;
+    int                          i, j;
+    double                       mu[2*DIM];
+    gmx_bool                     bFillGrid, bCalcCGCM;
+    gmx_bool                     bUseGPU, bUseOrEmulGPU;
+    nonbonded_verlet_t          *nbv      = fr->nbv.get();
+    interaction_const_t         *ic       = fr->ic;
+    gmx::StatePropagatorDataGpu *stateGpu = fr->stateGpu;
  
      // TODO remove the code below when the legacy flags are not in use anymore
      /* modify force flag if not doing nonbonded */
@@ -998,9 +997,27 @@ void do_force(FILE                                     *fplog,
      }
  #endif /* GMX_MPI */
  
+    // Coordinates on the device are needed if PME or BufferOps are offloaded.
+    // The local coordinates can be copied right away.
+    // NOTE: Consider moving this copy to right after they are updated and constrained,
+    //       if the later is not offloaded.
+    if (useGpuPme || useGpuXBufOps == BufferOpsUseGpu::True)
+    {
+        if (stepWork.doNeighborSearch)
+        {
+            stateGpu->reinit(mdatoms->homenr, cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
+            if (useGpuPme)
+            {
+                // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
+                pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
+            }
+        }
+        stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
+    }
+
      if (useGpuPme)
      {
-        launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
      }
  
      /* do gridding for pair search */
@@ -1124,14 +1141,8 @@ void do_force(FILE                                     *fplog,
      {
          if (useGpuXBufOps == BufferOpsUseGpu::True)
          {
-            // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
-            if (!useGpuPme)
-            {
-                nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::Local, false,
-                                          x.unpaddedArrayRef());
-            }
              nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::Local, false,
-                                       useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates());
+                                       stateGpu->getCoordinates());
          }
          else
          {
@@ -1210,9 +1221,7 @@ void do_force(FILE                                     *fplog,
              wallcycle_stop(wcycle, ewcNS);
              if (ddUsesGpuDirectCommunication)
              {
-                rvec* d_x    = static_cast<rvec *> (nbv->get_gpu_xrvec());
-                rvec* d_f    = static_cast<rvec *> (nbv->get_gpu_frvec());
-                gpuHaloExchange->reinitHalo(d_x, d_f);
+                gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces());
              }
          }
          else
@@ -1226,7 +1235,7 @@ void do_force(FILE                                     *fplog,
                  if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                  {
                      //non-local part of coordinate buffer must be copied back to host for CPU work
-                    nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal);
+                    stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
                  }
              }
              else
@@ -1239,11 +1248,10 @@ void do_force(FILE                                     *fplog,
                  // The condition here was (pme != nullptr && pme_gpu_get_device_x(fr->pmedata) != nullptr)
                  if (!useGpuPme && !ddUsesGpuDirectCommunication)
                  {
-                    nbv->copyCoordinatesToGpu(Nbnxm::AtomLocality::NonLocal, false,
-                                              x.unpaddedArrayRef());
+                    stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
                  }
                  nbv->convertCoordinatesGpu(Nbnxm::AtomLocality::NonLocal, false,
-                                           useGpuPme ? pme_gpu_get_device_x(fr->pmedata) : nbv->getDeviceCoordinates());
+                                           stateGpu->getCoordinates());
              }
              else
              {
@@ -1494,17 +1502,16 @@ void do_force(FILE                                     *fplog,
                  // which are a dependency for the GPU force reduction.
                  bool  haveNonLocalForceContribInCpuBuffer = domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork;
  
-                rvec *f = as_rvec_array(forceWithShiftForces.force().data());
                  if (haveNonLocalForceContribInCpuBuffer)
                  {
-                    nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::NonLocal);
+                    stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
                  }
                  nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
-                                                  nbv->getDeviceForces(),
+                                                  stateGpu->getForces(),
                                                    pme_gpu_get_device_f(fr->pmedata),
                                                    pme_gpu_get_f_ready_synchronizer(fr->pmedata),
                                                    useGpuPmeFReduction, haveNonLocalForceContribInCpuBuffer);
-                nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::NonLocal);
+                stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
              }
              else
              {
@@ -1538,17 +1545,14 @@ void do_force(FILE                                     *fplog,
  
          if (stepWork.computeForces)
          {
-            gmx::ArrayRef<gmx::RVec>  force  = forceOut.forceWithShiftForces().force();
-            rvec                     *f      = as_rvec_array(force.data());
  
              if (useGpuForcesHaloExchange)
              {
                  if (haveCpuLocalForces)
                  {
-                    nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+                    stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::Local);
                  }
-                bool accumulateHaloForces = haveCpuLocalForces;
-                gpuHaloExchange->communicateHaloForces(accumulateHaloForces);
+                gpuHaloExchange->communicateHaloForces(haveCpuLocalForces);
              }
              else
              {
@@ -1643,10 +1647,9 @@ void do_force(FILE                                     *fplog,
              // - copy is not perfomed if GPU force halo exchange is active, because it would overwrite the result
              //   of the halo exchange. In that case the copy is instead performed above, before the exchange.
              //   These should be unified.
-            rvec *f = as_rvec_array(forceWithShift.data());
              if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
              {
-                nbv->launch_copy_f_to_gpu(f, Nbnxm::AtomLocality::Local);
+                stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
              }
              if (useGpuForcesHaloExchange)
              {
@@ -1658,12 +1661,13 @@ void do_force(FILE                                     *fplog,
                  nbv->stream_local_wait_for_nonlocal();
              }
              nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
-                                              nbv->getDeviceForces(),
+                                              stateGpu->getForces(),
                                                pme_gpu_get_device_f(fr->pmedata),
                                                pme_gpu_get_f_ready_synchronizer(fr->pmedata),
                                                useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
-            nbv->launch_copy_f_from_gpu(f, Nbnxm::AtomLocality::Local);
+            // This function call synchronizes the local stream
              nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
+            stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
          }
          else
          {
diff --git a/src/gromacs/mdlib/update_constrain_cuda.h b/src/gromacs/mdlib/update_constrain_cuda.h

index 46b173ddab237a18f74283c728f1db075276a529..714ad7e8247f685258e88473b16d1156bc86c6c0 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda.h
+++ b/src/gromacs/mdlib/update_constrain_cuda.h
@@ -48,6 +48,7 @@
  #ifndef GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H
  #define GMX_MDLIB_UPDATE_CONSTRAIN_CUDA_H
  
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/mdtypes/group.h"
  #include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/classhelpers.h"
@@ -84,7 +85,7 @@ class UpdateConstrainCuda
           * This will extract temperature scaling factors from tcstat, transform them into the plain
           * array and call the normal integrate method.
           *
-         * \param[in]  dt                     Timestep
+         * \param[in]  dt                     Timestep.
           * \param[in]  updateVelocities       If the velocities should be constrained.
           * \param[in]  computeVirial          If virial should be updated.
           * \param[out] virial                 Place to save virial tensor.
@@ -104,16 +105,21 @@ class UpdateConstrainCuda
                         float                             dtPressureCouple,
                         const matrix                      velocityScalingMatrix);
  
-        /*! \brief
-         * Update data-structures (e.g. after NB search step).
+        /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
           *
-         * \param[in] idef                 System topology
-         * \param[in] md                   Atoms data.
-         * \param[in] numTempScaleValues   Number of temperature scaling groups. Zero for no temperature scaling.
+         * \param[in,out]  d_x                 Device buffer with coordinates.
+         * \param[in,out]  d_v                 Device buffer with velocities.
+         * \param[in]      d_f                 Device buffer with forces.
+         * \param[in]      idef                System topology
+         * \param[in]      md                  Atoms data.
+         * \param[in]      numTempScaleValues  Number of temperature scaling groups. Zero for no temperature scaling.
           */
-        void set(const t_idef    &idef,
-                 const t_mdatoms &md,
-                 int              numTempScaleValues);
+        void set(DeviceBuffer<float>  d_x,
+                 DeviceBuffer<float>  d_v,
+                 DeviceBuffer<float>  d_f,
+                 const t_idef        &idef,
+                 const t_mdatoms     &md,
+                 int                  numTempScaleValues);
  
          /*! \brief
           * Update PBC data.
@@ -124,72 +130,6 @@ class UpdateConstrainCuda
           */
          void setPbc(const t_pbc *pbc);
  
-        /*! \brief
-         * Copy coordinates from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_x  CPU pointer where coordinates should be copied from.
-         */
-        void copyCoordinatesToGpu(const rvec *h_x);
-
-        /*! \brief
-         * Copy velocities from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_v  CPU pointer where velocities should be copied from.
-         */
-        void copyVelocitiesToGpu(const rvec *h_v);
-
-        /*! \brief
-         * Copy forces from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_f  CPU pointer where forces should be copied from.
-         */
-        void copyForcesToGpu(const rvec *h_f);
-
-        /*! \brief
-         * Copy coordinates from GPU to CPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[out] h_xp CPU pointer where coordinates should be copied to.
-         */
-        void copyCoordinatesFromGpu(rvec *h_xp);
-
-        /*! \brief
-         * Copy velocities from GPU to CPU.
-         *
-         * The velocities are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_v  Pointer to velocities data.
-         */
-        void copyVelocitiesFromGpu(rvec *h_v);
-
-        /*! \brief
-         * Copy forces from GPU to CPU.
-         *
-         * The forces are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_f  Pointer to forces data.
-         */
-        void copyForcesFromGpu(rvec *h_f);
-
-        /*! \brief
-         * Set the internal GPU-memory d_x, d_xp and d_v pointers.
-         *
-         * Data is not copied. The data are assumed to be in float3/fvec format
-         * (float3 is used internally, but the data layout should be identical).
-         *
-         * \param[in] d_x  Pointer to the coordinates for the input (on GPU)
-         * \param[in] d_xp Pointer to the coordinates for the output (on GPU)
-         * \param[in] d_v  Pointer to the velocities (on GPU)
-         * \param[in] d_f  Pointer to the forces (on GPU)
-         */
-        void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f);
  
      private:
          class Impl;
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp

index 075ca163d51abdb470cd497091fb70e67d2d64cd..b5823ed97fbbe55612fb4fc6a5dd237719192068 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cpp
@@ -78,9 +78,12 @@ void UpdateConstrainCuda::integrate(gmx_unused const real
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
  }
  
-void UpdateConstrainCuda::set(gmx_unused const t_idef     &idef,
-                              gmx_unused const t_mdatoms  &md,
-                              gmx_unused const int         numTempScaleValues)
+void UpdateConstrainCuda::set(gmx_unused DeviceBuffer<float>        d_x,
+                              gmx_unused DeviceBuffer<float>        d_v,
+                              gmx_unused const DeviceBuffer<float>  d_f,
+                              gmx_unused const t_idef              &idef,
+                              gmx_unused const t_mdatoms           &md,
+                              gmx_unused const int                  numTempScaleValues)
  {
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
  }
@@ -90,44 +93,6 @@ void UpdateConstrainCuda::setPbc(gmx_unused const t_pbc *pbc)
      GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
  }
  
-void UpdateConstrainCuda::copyCoordinatesToGpu(gmx_unused const rvec *h_x)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyVelocitiesToGpu(gmx_unused const rvec *h_v)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyForcesToGpu(gmx_unused const rvec *h_f)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyCoordinatesFromGpu(gmx_unused rvec *h_xp)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyVelocitiesFromGpu(gmx_unused rvec *h_v)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::copyForcesFromGpu(gmx_unused rvec *h_f)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
-void UpdateConstrainCuda::setXVFPointers(gmx_unused rvec *d_x,
-                                         gmx_unused rvec *d_xp,
-                                         gmx_unused rvec *d_v,
-                                         gmx_unused rvec *d_f)
-{
-    GMX_ASSERT(false, "A CPU stub for UpdateConstrain was called insted of the correct implementation.");
-}
-
  }      // namespace gmx
  
  #endif /* GMX_GPU != GMX_GPU_CUDA */
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.cu b/src/gromacs/mdlib/update_constrain_cuda_impl.cu

index f373d2791135c56cd38695e1ed9238ae2687523a..9ba96267380acc8efc92e24ce88857a6bf5f9871 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.cu
@@ -105,6 +105,9 @@ void UpdateConstrainCuda::Impl::integrate(const real                        dt,
          }
      }
  
+    // TODO: This should be eliminated
+    cudaMemcpy(d_x_, d_xp_, numAtoms_*sizeof(float3), cudaMemcpyDeviceToDevice);
+
      return;
  }
  
@@ -124,16 +127,24 @@ UpdateConstrainCuda::Impl::~Impl()
  {
  }
  
-void UpdateConstrainCuda::Impl::set(const t_idef    &idef,
-                                    const t_mdatoms &md,
-                                    const int        numTempScaleValues)
+void UpdateConstrainCuda::Impl::set(DeviceBuffer<float>        d_x,
+                                    DeviceBuffer<float>        d_v,
+                                    const DeviceBuffer<float>  d_f,
+                                    const t_idef              &idef,
+                                    const t_mdatoms           &md,
+                                    const int                  numTempScaleValues)
  {
+    GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
+    GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
+    GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null.");
+
+    d_x_ = reinterpret_cast<float3*>(d_x);
+    d_v_ = reinterpret_cast<float3*>(d_v);
+    d_f_ = reinterpret_cast<float3*>(d_f);
+
      numAtoms_ = md.nr;
  
-    reallocateDeviceBuffer(&d_x_,  numAtoms_, &numX_,  &numXAlloc_,  nullptr);
      reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, nullptr);
-    reallocateDeviceBuffer(&d_v_,  numAtoms_, &numV_,  &numVAlloc_,  nullptr);
-    reallocateDeviceBuffer(&d_f_,  numAtoms_, &numF_,  &numFAlloc_,  nullptr);
  
      reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_,
                             &numInverseMasses_, &numInverseMassesAlloc_, nullptr);
@@ -152,44 +163,6 @@ void UpdateConstrainCuda::Impl::setPbc(const t_pbc *pbc)
      settleCuda_->setPbc(pbc);
  }
  
-void UpdateConstrainCuda::Impl::copyCoordinatesToGpu(const rvec *h_x)
-{
-    copyToDeviceBuffer(&d_x_, (float3*)h_x, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyVelocitiesToGpu(const rvec *h_v)
-{
-    copyToDeviceBuffer(&d_v_, (float3*)h_v, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyForcesToGpu(const rvec *h_f)
-{
-    copyToDeviceBuffer(&d_f_, (float3*)h_f, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyCoordinatesFromGpu(rvec *h_xp)
-{
-    copyFromDeviceBuffer((float3*)h_xp, &d_xp_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyVelocitiesFromGpu(rvec *h_v)
-{
-    copyFromDeviceBuffer((float3*)h_v, &d_v_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::copyForcesFromGpu(rvec *h_f)
-{
-    copyFromDeviceBuffer((float3*)h_f, &d_f_, 0, numAtoms_, commandStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
-void UpdateConstrainCuda::Impl::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f)
-{
-    d_x_  = (float3*)d_x;
-    d_xp_ = (float3*)d_xp;
-    d_v_  = (float3*)d_v;
-    d_f_  = (float3*)d_f;
-}
-
  UpdateConstrainCuda::UpdateConstrainCuda(const t_inputrec  &ir,
                                           const gmx_mtop_t  &mtop,
                                           const void        *commandStream)
@@ -207,18 +180,21 @@ void UpdateConstrainCuda::integrate(const real                        dt,
                                      gmx::ArrayRef<const t_grp_tcstat> tcstat,
                                      const bool                        doPressureCouple,
                                      const float                       dtPressureCouple,
-                                    const matrix                      pRVScalingMatrix)
+                                    const matrix                      velocityScalingMatrix)
  {
      impl_->integrate(dt, updateVelocities, computeVirial, virialScaled,
                       doTempCouple, tcstat,
-                     doPressureCouple, dtPressureCouple, pRVScalingMatrix);
+                     doPressureCouple, dtPressureCouple, velocityScalingMatrix);
  }
  
-void UpdateConstrainCuda::set(const t_idef    &idef,
-                              const t_mdatoms &md,
-                              const int        numTempScaleValues)
+void UpdateConstrainCuda::set(DeviceBuffer<float>        d_x,
+                              DeviceBuffer<float>        d_v,
+                              const DeviceBuffer<float>  d_f,
+                              const t_idef              &idef,
+                              const t_mdatoms           &md,
+                              const int                  numTempScaleValues)
  {
-    impl_->set(idef, md, numTempScaleValues);
+    impl_->set(d_x, d_v, d_f, idef, md, numTempScaleValues);
  }
  
  void UpdateConstrainCuda::setPbc(const t_pbc *pbc)
@@ -226,39 +202,4 @@ void UpdateConstrainCuda::setPbc(const t_pbc *pbc)
      impl_->setPbc(pbc);
  }
  
-void UpdateConstrainCuda::copyCoordinatesToGpu(const rvec *h_x)
-{
-    impl_->copyCoordinatesToGpu(h_x);
-}
-
-void UpdateConstrainCuda::copyVelocitiesToGpu(const rvec *h_v)
-{
-    impl_->copyVelocitiesToGpu(h_v);
-}
-
-void UpdateConstrainCuda::copyForcesToGpu(const rvec *h_f)
-{
-    impl_->copyForcesToGpu(h_f);
-}
-
-void UpdateConstrainCuda::copyCoordinatesFromGpu(rvec *h_xp)
-{
-    impl_->copyCoordinatesFromGpu(h_xp);
-}
-
-void UpdateConstrainCuda::copyVelocitiesFromGpu(rvec *h_v)
-{
-    impl_->copyVelocitiesFromGpu(h_v);
-}
-
-void UpdateConstrainCuda::copyForcesFromGpu(rvec *h_f)
-{
-    impl_->copyForcesFromGpu(h_f);
-}
-
-void UpdateConstrainCuda::setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f)
-{
-    impl_->setXVFPointers(d_x, d_xp, d_v, d_f);
-}
-
  } //namespace gmx
diff --git a/src/gromacs/mdlib/update_constrain_cuda_impl.h b/src/gromacs/mdlib/update_constrain_cuda_impl.h

index b2f4c0eb5e151bbb0ffd88b800fa41b0819b9caf..652dd84eb6e44fc93eb57c57a0a2f79095b62a9f 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_cuda_impl.h
+++ b/src/gromacs/mdlib/update_constrain_cuda_impl.h
@@ -87,7 +87,7 @@ class UpdateConstrainCuda::Impl
           *   2. This is the temperature coupling step.
           * Parameters virial/lambdas can be nullptr if computeVirial/doTempCouple are false.
           *
-         * \param[in]  dt                     Timestep
+         * \param[in]  dt                     Timestep.
           * \param[in]  updateVelocities       If the velocities should be constrained.
           * \param[in]  computeVirial          If virial should be updated.
           * \param[out] virial                 Place to save virial tensor.
@@ -97,26 +97,31 @@ class UpdateConstrainCuda::Impl
           * \param[in]  dtPressureCouple       Period between pressure coupling steps
           * \param[in]  velocityScalingMatrix  Parrinello-Rahman velocity scaling matrix
           */
-        void integrate(const real                        dt,
-                       const bool                        updateVelocities,
-                       const bool                        computeVirial,
+        void integrate(real                              dt,
+                       bool                              updateVelocities,
+                       bool                              computeVirial,
                         tensor                            virial,
-                       const bool                        doTempCouple,
+                       bool                              doTempCouple,
                         gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                       const bool                        doPressureCouple,
-                       const float                       dtPressureCouple,
+                       bool                              doPressureCouple,
+                       float                             dtPressureCouple,
                         const matrix                      velocityScalingMatrix);
  
-        /*! \brief
-         * Update data-structures (e.g. after NB search step).
+        /*! \brief Set the pointers and update data-structures (e.g. after NB search step).
           *
-         * \param[in] idef                 System topology
-         * \param[in] md                   Atoms data.
-         * \param[in] numTempScaleValues   Number of temperature scaling groups. Set zero for no temperature coupling.
+         * \param[in,out]  d_x            Device buffer with coordinates.
+         * \param[in,out]  d_v            Device buffer with velocities.
+         * \param[in]      d_f            Device buffer with forces.
+         * \param[in] idef                System topology
+         * \param[in] md                  Atoms data.
+         * \param[in] numTempScaleValues  Number of temperature scaling groups. Set zero for no temperature coupling.
           */
-        void set(const t_idef    &idef,
-                 const t_mdatoms &md,
-                 const int        numTempScaleValues);
+        void set(DeviceBuffer<float>        d_x,
+                 DeviceBuffer<float>        d_v,
+                 const DeviceBuffer<float>  d_f,
+                 const t_idef              &idef,
+                 const t_mdatoms           &md,
+                 const int                  numTempScaleValues);
  
          /*! \brief
           * Update PBC data.
@@ -127,73 +132,6 @@ class UpdateConstrainCuda::Impl
           */
          void setPbc(const t_pbc *pbc);
  
-        /*! \brief
-         * Copy coordinates from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_x  CPU pointer where coordinates should be copied from.
-         */
-        void copyCoordinatesToGpu(const rvec *h_x);
-
-        /*! \brief
-         * Copy velocities from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_v  CPU pointer where velocities should be copied from.
-         */
-        void copyVelocitiesToGpu(const rvec *h_v);
-
-        /*! \brief
-         * Copy forces from CPU to GPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_f  CPU pointer where forces should be copied from.
-         */
-        void copyForcesToGpu(const rvec *h_f);
-
-        /*! \brief
-         * Copy coordinates from GPU to CPU.
-         *
-         * The data are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[out] h_xp CPU pointer where coordinates should be copied to.
-         */
-        void copyCoordinatesFromGpu(rvec *h_xp);
-
-        /*! \brief
-         * Copy velocities from GPU to CPU.
-         *
-         * The velocities are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_v  Pointer to velocities data.
-         */
-        void copyVelocitiesFromGpu(rvec *h_v);
-
-        /*! \brief
-         * Copy forces from GPU to CPU.
-         *
-         * The forces are assumed to be in float3/fvec format (single precision).
-         *
-         * \param[in] h_f  Pointer to forces data.
-         */
-        void copyForcesFromGpu(rvec *h_f);
-
-        /*! \brief
-         * Set the internal GPU-memory x, xprime and v pointers.
-         *
-         * Data is not copied. The data are assumed to be in float3/fvec format
-         * (float3 is used internally, but the data layout should be identical).
-         *
-         * \param[in] d_x   Pointer to the coordinates for the input (on GPU)
-         * \param[in] d_xp  Pointer to the coordinates for the output (on GPU)
-         * \param[in] d_v   Pointer to the velocities (on GPU)
-         * \param[in] d_f   Pointer to the forces (on GPU)
-         */
-        void setXVFPointers(rvec *d_x, rvec *d_xp, rvec *d_v, rvec *d_f);
-
      private:
  
          //! CUDA stream
@@ -205,33 +143,20 @@ class UpdateConstrainCuda::Impl
          //! Number of atoms
          int                 numAtoms_;
  
-        //! Coordinates before the timestep (on GPU)
+        //! Local copy of the pointer to the device positions buffer
          float3             *d_x_;
-        //! Number of elements in coordinates buffer
-        int                 numX_                  = -1;
-        //! Allocation size for the coordinates buffer
-        int                 numXAlloc_             = -1;
+        //! Local copy of the pointer to the device velocities buffer
+        float3             *d_v_;
+        //! Local copy of the pointer to the device forces buffer
+        float3             *d_f_;
  
-        //! Coordinates after the timestep (on GPU).
+        //! Device buffer for intermediate positions (maintained internally)
          float3             *d_xp_;
          //! Number of elements in shifted coordinates buffer
          int                 numXp_                 = -1;
          //! Allocation size for the shifted coordinates buffer
          int                 numXpAlloc_            = -1;
  
-        //! Velocities of atoms (on GPU)
-        float3             *d_v_;
-        //! Number of elements in velocities buffer
-        int                 numV_                  = -1;
-        //! Allocation size for the velocities buffer
-        int                 numVAlloc_             = -1;
-
-        //! Forces, exerted by atoms (on GPU)
-        float3             *d_f_;
-        //! Number of elements in forces buffer
-        int                 numF_                  = -1;
-        //! Allocation size for the forces buffer
-        int                 numFAlloc_             = -1;
  
          //! 1/mass for all atoms (GPU)
          real               *d_inverseMasses_;
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 370cc511752dce9b7234dbe0c1026278f630f686..99f2a4f0de72ac37dba5f2bb6b090e387e47b9cc 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -117,7 +117,9 @@
  #include "gromacs/mdtypes/observableshistory.h"
  #include "gromacs/mdtypes/pullhistory.h"
  #include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
  #include "gromacs/modularsimulator/energyelement.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
  #include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/pbcutil/mshift.h"
  #include "gromacs/pbcutil/pbc.h"
@@ -315,8 +317,15 @@ void gmx::LegacySimulator::do_md()
          upd.setNumAtoms(state->natoms);
      }
  
+/*****************************************************************************************/
+// TODO: The following block of code should be refactored, once:
+//       1. We have the useGpuForBufferOps variable set and available here and in do_force(...)
+//       2. The proper GPU syncronization is introduced, so that the H2D and D2H data copies can be performed in the separate
+//          stream owned by the StatePropagatorDataGpu
      bool useGpuForPme       = (fr->pmedata != nullptr) && (pme_run_mode(fr->pmedata) != PmeRunMode::CPU);
      bool useGpuForNonbonded = fr->nbv->useGpu();
+    // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
+    bool useGpuForBufferOps   = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
  
      if (useGpuForUpdate)
      {
@@ -346,10 +355,19 @@ void gmx::LegacySimulator::do_md()
          integrator = std::make_unique<UpdateConstrainCuda>(*ir, *top_global, nullptr);
      }
  
-    if (fr->nbv->useGpu())
+    if (useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
      {
-        changePinningPolicy(&state->x, gmx::PinningPolicy::PinnedIfSupported);
+        changePinningPolicy(&state->x, PinningPolicy::PinnedIfSupported);
      }
+    if ((useGpuForNonbonded && useGpuForBufferOps) || useGpuForUpdate)
+    {
+        changePinningPolicy(&f, PinningPolicy::PinnedIfSupported);
+    }
+    if (useGpuForUpdate)
+    {
+        changePinningPolicy(&state->v, PinningPolicy::PinnedIfSupported);
+    }
+/*****************************************************************************************/
  
      // NOTE: The global state is no longer used at this point.
      // But state_global is still used as temporary storage space for writing
@@ -1200,16 +1218,19 @@ void gmx::LegacySimulator::do_md()
  
          if (useGpuForUpdate)
          {
+            StatePropagatorDataGpu *stateGpu = fr->stateGpu;
              if (bNS)
              {
-                integrator->set(top.idef, *mdatoms, ekind->ngtc);
+                integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(), stateGpu->getForces(),
+                                top.idef, *mdatoms, ekind->ngtc);
                  t_pbc pbc;
                  set_pbc(&pbc, epbcXYZ, state->box);
                  integrator->setPbc(&pbc);
              }
-            integrator->copyCoordinatesToGpu(state->x.rvec_array());
-            integrator->copyVelocitiesToGpu(state->v.rvec_array());
-            integrator->copyForcesToGpu(as_rvec_array(f.data()));
+
+            stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
+            stateGpu->copyVelocitiesToGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
+            stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), StatePropagatorDataGpu::AtomLocality::All);
  
              bool doTempCouple     = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
              bool doPressureCouple = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
@@ -1218,9 +1239,9 @@ void gmx::LegacySimulator::do_md()
              integrator->integrate(ir->delta_t, true, bCalcVir, shake_vir,
                                    doTempCouple, ekind->tcstat,
                                    doPressureCouple, ir->nstpcouple*ir->delta_t, M);
-
-            integrator->copyCoordinatesFromGpu(state->x.rvec_array());
-            integrator->copyVelocitiesFromGpu(state->v.rvec_array());
+            stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), StatePropagatorDataGpu::AtomLocality::All);
+            stateGpu->copyVelocitiesFromGpu(state->v, StatePropagatorDataGpu::AtomLocality::All);
+            stateGpu->synchronizeStream();
          }
          else
          {
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index 1793b91d542cdc75b7ad0fc5dc5b27bd1f28916a..7bf64f9185444191dee340349ccff184be3c39c7 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -115,6 +115,7 @@
  #include "gromacs/mdtypes/observableshistory.h"
  #include "gromacs/mdtypes/simulation_workload.h"
  #include "gromacs/mdtypes/state.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
  #include "gromacs/nbnxm/gpu_data_mgmt.h"
  #include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/nbnxm/pairlist_tuning.h"
@@ -1501,6 +1502,27 @@ int Mdrunner::mdrunner()
                                                           fcd->orires.nr != 0,
                                                           fcd->disres.nsystems != 0);
  
+        const void *commandStream = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_stream(fr->pmedata) : nullptr;
+        const void *gpuContext    = ((GMX_GPU == GMX_GPU_OPENCL) && thisRankHasPmeGpuTask) ? pme_gpu_get_device_context(fr->pmedata) : nullptr;
+        const int   paddingSize   = pme_gpu_get_padding_size(fr->pmedata);
+
+        const bool  inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
+                    false,
+                    inputrec, doRerun, vsite.get(), ms, replExParams,
+                    fcd, static_cast<int>(filenames.size()), filenames.data(),
+                    &observablesHistory, membed);
+
+        const bool          useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
+        GpuApiCallBehavior  transferKind        = (inputrec->eI == eiMD && !doRerun && !useModularSimulator) ? GpuApiCallBehavior::Async : GpuApiCallBehavior::Sync;
+
+        // We initialize GPU state even for the CPU runs so we will have a more verbose
+        // error if someone will try accessing it from the CPU codepath
+        gmx::StatePropagatorDataGpu stateGpu(commandStream,
+                                             gpuContext,
+                                             transferKind,
+                                             paddingSize);
+        fr->stateGpu = &stateGpu;
+
          // TODO This is not the right place to manage the lifetime of
          // this data structure, but currently it's the easiest way to
          // make it work.
@@ -1510,11 +1532,6 @@ int Mdrunner::mdrunner()
          SimulatorBuilder simulatorBuilder;
  
          // build and run simulator object based on user-input
-        const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
-                    false,
-                    inputrec, doRerun, vsite.get(), ms, replExParams,
-                    fcd, static_cast<int>(filenames.size()), filenames.data(),
-                    &observablesHistory, membed);
          auto simulator = simulatorBuilder.build(
                      inputIsCompatibleWithModularSimulator,
                      fplog, cr, ms, mdlog, static_cast<int>(filenames.size()), filenames.data(),
diff --git a/src/gromacs/mdtypes/CMakeLists.txt b/src/gromacs/mdtypes/CMakeLists.txt

index 788b6cf3bcbd325af8c47f678496a36eee39e5cd..f9adf04c23b78fa5697affb9a185858a0de7bad1 100644 (file)
--- a/src/gromacs/mdtypes/CMakeLists.txt
+++ b/src/gromacs/mdtypes/CMakeLists.txt
@@ -32,7 +32,31 @@
  # To help us fund GROMACS development, we humbly ask that you cite
  # the research papers on the package. Check out http://www.gromacs.org.
  
-file(GLOB MDTYPES_SOURCES *.cpp)
+file(GLOB MDTYPES_SOURCES
+    df_history.cpp
+    group.cpp
+    iforceprovider.cpp
+    inputrec.cpp
+    md_enums.cpp
+    observableshistory.cpp
+    state.cpp)
+
+if(GMX_USE_CUDA OR GMX_USE_OPENCL)
+    gmx_add_libgromacs_sources(
+       state_propagator_data_gpu_impl_gpu.cpp
+       )
+    if(GMX_USE_CUDA)
+        gmx_compile_cpp_as_cuda(
+            state_propagator_data_gpu_impl_gpu.cpp
+            )
+    endif()
+else()
+    gmx_add_libgromacs_sources(
+      state_propagator_data_gpu_impl.cpp
+      )
+endif()
+
+
  set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${MDTYPES_SOURCES} PARENT_SCOPE)
  
  if(GMX_INSTALL_LEGACY_API)
diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h

index ec01003d243b5308c65a4d29f91202b8d11075e3..540d0d073e4983d070cae4e7dacc013b37cdd30a 100644 (file)
--- a/src/gromacs/mdtypes/forcerec.h
+++ b/src/gromacs/mdtypes/forcerec.h
@@ -60,6 +60,7 @@ namespace gmx
  {
  class GpuBonded;
  class ForceProviders;
+class StatePropagatorDataGpu;
  }
  
  /* macros for the cginfo data in forcerec
@@ -268,6 +269,11 @@ struct t_forcerec { // NOLINT (clang-analyzer-optin.performance.Padding)
      struct ewald_corr_thread_t *ewc_t       = nullptr;
  
      gmx::ForceProviders        *forceProviders = nullptr;
+
+    // The stateGpu object is created in runner, forcerec just keeps the copy of the pointer.
+    // TODO: This is not supposed to be here. StatePropagatorDataGpu should be a part of
+    //       general StatePropagatorData object that is passed around
+    gmx::StatePropagatorDataGpu  *stateGpu = nullptr;
  };
  
  /* Important: Starting with Gromacs-4.6, the values of c6 and c12 in the nbfp array have
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h

new file mode 100644 (file)

index 0000000..cf20737
--- /dev/null
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -0,0 +1,225 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declaration of interfaces for GPU state data propagator object.
+ *
+ * This object stores and manages positions, velocities and forces for
+ * all particles in the system on the GPU.
+ *
+ * \todo Add cycle counters.
+ * \todo Add synchronization points.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \inlibraryapi
+ * \ingroup module_mdtypes
+ */
+#ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
+#define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
+
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/classhelpers.h"
+
+namespace gmx
+{
+
+class StatePropagatorDataGpu
+{
+    public:
+
+        /*! \brief Atom locality indicator: local, non-local, all.
+         *
+         * \todo This should be managed by a separate object, since the localities
+         *       are used here and in buffer ops.
+         */
+        enum class AtomLocality : int
+        {
+            Local    = 0, //!< Local atoms
+            NonLocal = 1, //!< Non-local atoms
+            All      = 2, //!< Both local and non-local atoms
+            Count    = 3  //!< The number of atom locality types
+        };
+
+        /*! \brief Constructor
+         *
+         * The buffers are reallocated only at the reinit call, the padding is
+         * used there for the coordinates buffer. It is needed for PME and added at
+         * the end of the buffer. It is assumed that if the rank has PME duties on the
+         * GPU, all coordinates are copied to the GPU and hence, for this rank, the
+         * coordinates buffer is not split into local and non-local ranges. For other
+         * ranks, the padding size is zero. This works because only one rank ever does
+         * PME work on the GPU, and if that rank also does PP work that is the only
+         * rank. So all coordinates are always transferred.
+         *
+         * \note \p commandStream and \p gpuContext are allowed to be nullptr if
+         *       StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
+         *       does not run on the GPU).
+         *
+         * \todo Make \p CommandStream visible in the CPU parts of the code so we
+         *       will not have to pass a void*.
+         * \todo Make \p Context visible in CPU parts of the code so we will not
+         *       have to pass a void*.
+         *
+         *  \param[in] commandStream  GPU stream, nullptr allowed.
+         *  \param[in] gpuContext     GPU context, nullptr allowed.
+         *  \param[in] transferKind   H2D/D2H transfer call behavior (synchronous or not).
+         *  \param[in] paddingSize    Padding size for coordinates buffer.
+         */
+        StatePropagatorDataGpu(const void        *commandStream,
+                               const void        *gpuContext,
+                               GpuApiCallBehavior transferKind,
+                               int                paddingSize);
+
+        ~StatePropagatorDataGpu();
+
+        /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+         *
+         * The coordinates buffer is reallocated with the padding added at the end. The
+         * size of padding is set by the constructor.
+         *
+         *  \param[in] numAtomsLocal  Number of atoms in local domain.
+         *  \param[in] numAtomsAll    Total number of atoms to handle.
+         */
+        void reinit(int numAtomsLocal, int numAtomsAll);
+
+        /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local).
+         *
+         * \todo There are at least three versions of the function with this functionality in the code:
+         *       this one and two more in NBNXM. These should be unified in a shape of a general function
+         *       in DD.
+         *
+         * \param[in]  atomLocality    If all, local or non-local ranges are needed.
+         *
+         * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
+         */
+        std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality  atomLocality);
+
+
+        /*! \brief Get the positions buffer on the GPU.
+         *
+         *  \returns GPU positions buffer.
+         */
+        DeviceBuffer<float> getCoordinates();
+
+        /*! \brief Copy positions to the GPU memory.
+         *
+         *  \param[in] h_x           Positions in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
+                                  AtomLocality                    atomLocality);
+
+        /*! \brief Copy positions from the GPU memory.
+         *
+         *  \param[in] h_x           Positions buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
+                                    AtomLocality              atomLocality);
+
+
+        /*! \brief Get the velocities buffer on the GPU.
+         *
+         *  \returns GPU velocities buffer.
+         */
+        DeviceBuffer<float> getVelocities();
+
+        /*! \brief Copy velocities to the GPU memory.
+         *
+         *  \param[in] h_v           Velocities in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec>  h_v,
+                                 AtomLocality                    atomLocality);
+
+        /*! \brief Copy velocities from the GPU memory.
+         *
+         *  \param[in] h_v           Velocities buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  h_v,
+                                   AtomLocality              atomLocality);
+
+
+        /*! \brief Get the force buffer on the GPU.
+         *
+         *  \returns GPU force buffer.
+         */
+        DeviceBuffer<float> getForces();
+
+        /*! \brief Copy forces to the GPU memory.
+         *
+         *  \param[in] h_f           Forces in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
+                             AtomLocality                    atomLocality);
+
+        /*! \brief Copy forces from the GPU memory.
+         *
+         *  \param[in] h_f           Forces buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
+                               AtomLocality              atomLocality);
+        /*! \brief Synchronize the underlying GPU stream
+         */
+        void synchronizeStream();
+
+        /*! \brief Getter for the number of local atoms.
+         *
+         *  \returns The number of local atoms.
+         */
+        int numAtomsLocal();
+
+        /*! \brief Getter for the total number of atoms.
+         *
+         *  \returns The total number of atoms.
+         */
+        int numAtomsAll();
+
+    private:
+        class Impl;
+        gmx::PrivateImplPointer<Impl> impl_;
+
+};
+
+}      // namespace gmx
+
+#endif // GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_H
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp

new file mode 100644 (file)

index 0000000..dd0aa76
--- /dev/null
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -0,0 +1,156 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief The CPU stub for the state propagator data class.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdtypes
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+
+#if GMX_GPU == GMX_GPU_NONE
+namespace gmx
+{
+
+class StatePropagatorDataGpu::Impl
+{
+};
+
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void *       /* commandStream */,
+                                               const void *       /* gpuContext    */,
+                                               GpuApiCallBehavior /* transferKind  */,
+                                               int                /* paddingSize   */)
+    : impl_(nullptr)
+{
+}
+
+StatePropagatorDataGpu::~StatePropagatorDataGpu()
+{
+}
+
+void StatePropagatorDataGpu::reinit(int  /* numAtomsLocal */,
+                                    int  /* numAtomsAll   */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality  /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return std::make_tuple(0, 0);
+}
+
+DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return DeviceBuffer<float> {};
+}
+
+void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec>  /* h_x          */,
+                                                  AtomLocality                          /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_x          */,
+                                                    AtomLocality              /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return DeviceBuffer<float> {};
+}
+
+void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec>  /* h_v          */,
+                                                 AtomLocality                          /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_v          */,
+                                                   AtomLocality              /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return DeviceBuffer<float> {};
+}
+
+void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec>  /* h_f          */,
+                                             AtomLocality                          /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  /* h_f          */,
+                                               AtomLocality              /* atomLocality */)
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+void StatePropagatorDataGpu::synchronizeStream()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+}
+
+int StatePropagatorDataGpu::numAtomsLocal()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return 0;
+}
+
+int StatePropagatorDataGpu::numAtomsAll()
+{
+    GMX_ASSERT(false, "A CPU stub method from GPU state propagator data was called insted of one from GPU implementation.");
+    return 0;
+}
+
+}      // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_NONE
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h

new file mode 100644 (file)

index 0000000..fe8dca0
--- /dev/null
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -0,0 +1,273 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declaration of low-level functions and fields of GPU state propagator object.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdtypes
+ */
+#ifndef GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H
+#define GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H
+
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+#include "gromacs/utility/classhelpers.h"
+
+namespace gmx
+{
+
+class StatePropagatorDataGpu::Impl
+{
+    public:
+
+        Impl();
+
+
+        /*! \brief Constructor
+         *
+         * The buffers are reallocated only at the reinit call, the padding is
+         * used there for the coordinates buffer. It is needed for PME and added at
+         * the end of the buffer. It is assumed that if the rank has PME duties on the
+         * GPU, all coordinates are copied to the GPU and hence, for this rank, the
+         * coordinates buffer is not split into local and non-local ranges. For other
+         * ranks, the padding size is zero. This works because only one rank ever does
+         * PME work on the GPU, and if that rank also does PP work that is the only
+         * rank. So all coordinates are always transferred.
+         *
+         * \note \p commandStream and \p gpuContext are allowed to be nullptr if
+         *       StatePropagatorDataGpu is not used in the OpenCL run (e.g. if PME
+         *       does not run on the GPU).
+         *
+         * \todo Make CommandStream visible in the CPU parts of the code so we
+         *       will not have to pass a void*.
+         * \todo Make a Context object visible in CPU parts of the code so we
+         *       will not have to pass a void*.
+         *
+         *  \param[in] commandStream  GPU stream, nullptr allowed.
+         *  \param[in] gpuContext     GPU context, nullptr allowed.
+         *  \param[in] transferKind   H2D/D2H transfer call behavior (synchronous or not).
+         *  \param[in] paddingSize    Padding size for coordinates buffer.
+         */
+        Impl(const void        *commandStream,
+             const void        *gpuContext,
+             GpuApiCallBehavior transferKind,
+             int                paddingSize);
+
+        ~Impl();
+
+
+        /*! \brief Set the ranges for local and non-local atoms and reallocates buffers.
+         *
+         * The coordinates buffer is reallocated with the padding added at the end. The
+         * size of padding is set by the constructor.
+         *
+         *  \param[in] numAtomsLocal  Number of atoms in local domain.
+         *  \param[in] numAtomsAll    Total number of atoms to handle.
+         */
+        void reinit(int numAtomsLocal, int numAtomsAll);
+
+        /*! \brief Returns the range of atoms to be copied based on the copy type (all, local or non-local).
+         *
+         * \todo There are at least three versions of the function with this functionality in the code:
+         *       this one and two more in NBNXM. These should be unified in a shape of a general function
+         *       in DD.
+         *
+         * \param[in]  atomLocality    If all, local or non-local ranges are needed.
+         *
+         * \returns Tuple, containing the index of the first atom in the range and the total number of atoms in the range.
+         */
+        std::tuple<int, int> getAtomRangesFromAtomLocality(AtomLocality  atomLocality);
+
+
+        /*! \brief Get the positions buffer on the GPU.
+         *
+         *  \returns GPU positions buffer.
+         */
+        DeviceBuffer<float> getCoordinates();
+
+        /*! \brief Copy positions to the GPU memory.
+         *
+         *  \param[in] h_x           Positions in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyCoordinatesToGpu(gmx::ArrayRef<const gmx::RVec>  h_x,
+                                  AtomLocality                    atomLocality);
+
+        /*! \brief Copy positions from the GPU memory.
+         *
+         *  \param[in] h_x           Positions buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
+                                    AtomLocality              atomLocality);
+
+
+        /*! \brief Get the velocities buffer on the GPU.
+         *
+         *  \returns GPU velocities buffer.
+         */
+        DeviceBuffer<float> getVelocities();
+
+        /*! \brief Copy velocities to the GPU memory.
+         *
+         *  \param[in] h_v           Velocities in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyVelocitiesToGpu(gmx::ArrayRef<const gmx::RVec>  h_v,
+                                 AtomLocality                    atomLocality);
+
+        /*! \brief Copy velocities from the GPU memory.
+         *
+         *  \param[in] h_v           Velocities buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  h_v,
+                                   AtomLocality              atomLocality);
+
+
+        /*! \brief Get the force buffer on the GPU.
+         *
+         *  \returns GPU force buffer.
+         */
+        DeviceBuffer<float> getForces();
+
+        /*! \brief Copy forces to the GPU memory.
+         *
+         *  \param[in] h_f           Forces in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyForcesToGpu(gmx::ArrayRef<const gmx::RVec>  h_f,
+                             AtomLocality                    atomLocality);
+
+        /*! \brief Copy forces from the GPU memory.
+         *
+         *  \param[in] h_f           Forces buffer in the host memory.
+         *  \param[in] atomLocality  Locality of the particles to copy.
+         */
+        void copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
+                               AtomLocality              atomLocality);
+
+        /*! \brief Synchronize the underlying GPU stream
+         */
+        void synchronizeStream();
+
+        /*! \brief Getter for the number of local atoms.
+         *
+         *  \returns The number of local atoms.
+         */
+        int numAtomsLocal();
+
+        /*! \brief Getter for the total number of atoms.
+         *
+         *  \returns The total number of atoms.
+         */
+        int numAtomsAll();
+
+    private:
+
+        /*! \brief GPU stream.
+         * \todo The stream should be set to non-nullptr once the synchronization points are restored
+         */
+        CommandStream        commandStream_              = nullptr;
+        /*! \brief GPU context (for OpenCL builds)
+         * \todo Make a Context class usable in CPU code
+         */
+        Context              gpuContext_                 = nullptr;
+        //! Default GPU calls behavior
+        GpuApiCallBehavior   transferKind_               = GpuApiCallBehavior::Async;
+        //! Padding size for the coordinates buffer
+        int                  paddingSize_                = 0;
+
+        //! Number of local atoms
+        int                  numAtomsLocal_              = -1;
+        //! Total number of atoms
+        int                  numAtomsAll_                = -1;
+
+        //! Device positions buffer
+        DeviceBuffer<float>  d_x_;
+        //! Number of particles saved in the positions buffer
+        int                  d_xSize_                    = -1;
+        //! Allocation size for the positions buffer
+        int                  d_xCapacity_                = -1;
+
+        //! Device velocities buffer
+        DeviceBuffer<float>  d_v_;
+        //! Number of particles saved in the velocities buffer
+        int                  d_vSize_                    = -1;
+        //! Allocation size for the velocities buffer
+        int                  d_vCapacity_                = -1;
+
+        //! Device force buffer
+        DeviceBuffer<float>  d_f_;
+        //! Number of particles saved in the force buffer
+        int                  d_fSize_                    = -1;
+        //! Allocation size for the force buffer
+        int                  d_fCapacity_                = -1;
+
+        /*! \brief Performs the copy of data from host to device buffer.
+         *
+         * \todo Template on locality.
+         *
+         * \param[in,out]  d_data        Device-side buffer.
+         * \param[in,out]  h_data        Host-side buffer.
+         * \param[in]      dataSize      Device-side data allocation size.
+         * \param[in]      atomLocality  If all, local or non-local ranges should be copied.
+         */
+        void copyToDevice(DeviceBuffer<float>                   d_data,
+                          const gmx::ArrayRef<const gmx::RVec>  h_data,
+                          int                                   dataSize,
+                          AtomLocality                          atomLocality);
+
+        /*! \brief Performs the copy of data from device to host buffer.
+         *
+         * \param[in,out]  h_data        Host-side buffer.
+         * \param[in,out]  d_data        Device-side buffer.
+         * \param[in]      dataSize      Device-side data allocation size.
+         * \param[in]      atomLocality  If all, local or non-local ranges should be copied.
+         */
+        void copyFromDevice(gmx::ArrayRef<gmx::RVec>  h_data,
+                            DeviceBuffer<float>       d_data,
+                            int                       dataSize,
+                            AtomLocality              atomLocality);
+};
+
+}      // namespace gmx
+
+#endif // GMX_MDTYPES_STATE_PROPAGATOR_DATA_GPU_IMPL_H
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

new file mode 100644 (file)

index 0000000..d2c1e5d
--- /dev/null
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -0,0 +1,381 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Definitions of interfaces for GPU state data propagator object.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdtypes
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#if GMX_GPU != GMX_GPU_NONE
+
+#if GMX_GPU == GMX_GPU_CUDA
+#include "gromacs/gpu_utils/cudautils.cuh"
+#endif
+#include "gromacs/gpu_utils/devicebuffer.h"
+#if GMX_GPU == GMX_GPU_OPENCL
+#include "gromacs/gpu_utils/oclutils.h"
+#endif
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+#include "gromacs/utility/classhelpers.h"
+
+#include "state_propagator_data_gpu_impl.h"
+
+namespace gmx
+{
+
+StatePropagatorDataGpu::Impl::Impl(gmx_unused const void *commandStream,
+                                   gmx_unused const void *gpuContext,
+                                   GpuApiCallBehavior     transferKind,
+                                   int                    paddingSize) :
+    transferKind_(transferKind),
+    paddingSize_(paddingSize)
+{
+
+    GMX_RELEASE_ASSERT(getenv("GMX_USE_GPU_BUFFER_OPS") == nullptr, "GPU buffer ops are not supported in this build.");
+
+    // Set the stream-context pair for the OpenCL builds,
+    // use the nullptr stream for CUDA builds
+#if GMX_GPU == GMX_GPU_OPENCL
+    if (commandStream != nullptr)
+    {
+        commandStream_ = *static_cast<const CommandStream*>(commandStream);
+    }
+    if (gpuContext != nullptr)
+    {
+        gpuContext_ = *static_cast<const Context*>(gpuContext);
+    }
+#endif
+
+}
+
+StatePropagatorDataGpu::Impl::~Impl()
+{
+}
+
+void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
+{
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+#endif
+    numAtomsLocal_ = numAtomsLocal;
+    numAtomsAll_   = numAtomsAll;
+
+    int numAtomsPadded;
+    if (paddingSize_ > 0)
+    {
+        numAtomsPadded = ((numAtomsAll_ + paddingSize_ - 1 ) / paddingSize_ )*paddingSize_;
+    }
+    else
+    {
+        numAtomsPadded = numAtomsAll_;
+    }
+
+    reallocateDeviceBuffer(&d_x_, DIM*numAtomsPadded, &d_xSize_, &d_xCapacity_, gpuContext_);
+
+    const size_t paddingAllocationSize = numAtomsPadded - numAtomsAll_;
+    if (paddingAllocationSize > 0)
+    {
+        clearDeviceBufferAsync(&d_x_, DIM*numAtomsAll_, DIM*paddingAllocationSize, commandStream_);
+    }
+
+    reallocateDeviceBuffer(&d_v_, DIM*numAtomsAll_, &d_vSize_, &d_vCapacity_, gpuContext_);
+    reallocateDeviceBuffer(&d_f_, DIM*numAtomsAll_, &d_fSize_, &d_fCapacity_, gpuContext_);
+
+}
+
+std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality  atomLocality)
+{
+    int atomsStartAt   = 0;
+    int numAtomsToCopy = 0;
+    switch (atomLocality)
+    {
+        case AtomLocality::All:
+            atomsStartAt    = 0;
+            numAtomsToCopy  = numAtomsAll_;
+            break;
+        case AtomLocality::Local:
+            atomsStartAt    = 0;
+            numAtomsToCopy  = numAtomsLocal_;
+            break;
+        case AtomLocality::NonLocal:
+            atomsStartAt    = numAtomsLocal_;
+            numAtomsToCopy  = numAtomsAll_ - numAtomsLocal_;
+            break;
+        default:
+            GMX_RELEASE_ASSERT(false, "Wrong range of atoms requested in GPU state data manager. Should be All, Local or NonLocal.");
+    }
+    GMX_ASSERT(atomsStartAt   >= 0, "The first elemtnt to copy has negative index. Probably, the GPU propagator state was not initialized.");
+    GMX_ASSERT(numAtomsToCopy >= 0, "Number of atoms to copy is negative. Probably, the GPU propagator state was not initialized.");
+    return std::make_tuple(atomsStartAt, numAtomsToCopy);
+}
+
+void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<float>                   d_data,
+                                                const gmx::ArrayRef<const gmx::RVec>  h_data,
+                                                int                                   dataSize,
+                                                AtomLocality                          atomLocality)
+{
+
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+#endif
+
+    GMX_UNUSED_VALUE(dataSize);
+
+    GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
+
+    int atomsStartAt, numAtomsToCopy;
+    std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
+
+    int elementsStartAt   = atomsStartAt*DIM;
+    int numElementsToCopy = numAtomsToCopy*DIM;
+
+    if (numAtomsToCopy != 0)
+    {
+        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range.");
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range.");
+
+        // TODO: Use the proper stream
+        copyToDeviceBuffer(&d_data, reinterpret_cast<const float *>(&h_data.data()[atomsStartAt]),
+                           elementsStartAt, numElementsToCopy,
+                           commandStream_, transferKind_, nullptr);
+    }
+}
+
+void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec>  h_data,
+                                                  DeviceBuffer<float>       d_data,
+                                                  int                       dataSize,
+                                                  AtomLocality              atomLocality)
+{
+
+#if GMX_GPU == GMX_GPU_OPENCL
+    GMX_ASSERT(gpuContext_ != nullptr, "GPU context should be set in OpenCL builds.");
+#endif
+
+    GMX_UNUSED_VALUE(dataSize);
+
+    GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
+
+    int atomsStartAt, numAtomsToCopy;
+    std::tie(atomsStartAt, numAtomsToCopy) = getAtomRangesFromAtomLocality(atomLocality);
+
+    int elementsStartAt   = atomsStartAt*DIM;
+    int numElementsToCopy = numAtomsToCopy*DIM;
+
+    if (numAtomsToCopy != 0)
+    {
+        GMX_ASSERT(elementsStartAt + numElementsToCopy <= dataSize, "The device allocation is smaller than requested copy range.");
+        GMX_ASSERT(atomsStartAt + numAtomsToCopy <= h_data.ssize(), "The host buffer is smaller than the requested copy range.");
+
+        // TODO: Use the proper stream
+        copyFromDeviceBuffer(reinterpret_cast<float*>(&h_data.data()[atomsStartAt]), &d_data,
+                             elementsStartAt, numElementsToCopy,
+                             commandStream_, transferKind_, nullptr);
+
+    }
+}
+
+DeviceBuffer<float> StatePropagatorDataGpu::Impl::getCoordinates()
+{
+    return d_x_;
+}
+
+void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_x,
+                                                        AtomLocality                          atomLocality)
+{
+    copyToDevice(d_x_, h_x, d_xSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec>  h_x,
+                                                          AtomLocality              atomLocality)
+{
+    copyFromDevice(h_x, d_x_, d_xSize_, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::Impl::getVelocities()
+{
+    return d_v_;
+}
+
+void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_v,
+                                                       AtomLocality                          atomLocality)
+{
+    copyToDevice(d_v_, h_v, d_vSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec>  h_v,
+                                                         AtomLocality              atomLocality)
+{
+    copyFromDevice(h_v, d_v_, d_vSize_, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::Impl::getForces()
+{
+    return d_f_;
+}
+
+void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_f,
+                                                   AtomLocality                          atomLocality)
+{
+    copyToDevice(d_f_, h_f, d_fSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec>  h_f,
+                                                     AtomLocality              atomLocality)
+{
+    copyFromDevice(h_f, d_f_, d_fSize_, atomLocality);
+}
+
+void StatePropagatorDataGpu::Impl::synchronizeStream()
+{
+    gpuStreamSynchronize(commandStream_);
+}
+
+int StatePropagatorDataGpu::Impl::numAtomsLocal()
+{
+    return numAtomsLocal_;
+}
+
+int StatePropagatorDataGpu::Impl::numAtomsAll()
+{
+    return numAtomsAll_;
+}
+
+
+
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void        *commandStream,
+                                               const void        *gpuContext,
+                                               GpuApiCallBehavior transferKind,
+                                               int                paddingSize)
+    : impl_(new Impl(commandStream,
+                     gpuContext,
+                     transferKind,
+                     paddingSize))
+{
+}
+
+StatePropagatorDataGpu::~StatePropagatorDataGpu() = default;
+
+
+void StatePropagatorDataGpu::reinit(int numAtomsLocal, int numAtomsAll)
+{
+    return impl_->reinit(numAtomsLocal, numAtomsAll);
+}
+
+std::tuple<int, int> StatePropagatorDataGpu::getAtomRangesFromAtomLocality(AtomLocality  atomLocality)
+{
+    return impl_->getAtomRangesFromAtomLocality(atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getCoordinates()
+{
+    return impl_->getCoordinates();
+}
+
+void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_x,
+                                                  AtomLocality                          atomLocality)
+{
+    return impl_->copyCoordinatesToGpu(h_x, atomLocality);
+}
+
+void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec>  h_x,
+                                                    AtomLocality         atomLocality)
+{
+    return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getVelocities()
+{
+    return impl_->getVelocities();
+}
+
+void StatePropagatorDataGpu::copyVelocitiesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_v,
+                                                 AtomLocality                          atomLocality)
+{
+    return impl_->copyVelocitiesToGpu(h_v, atomLocality);
+}
+
+void StatePropagatorDataGpu::copyVelocitiesFromGpu(gmx::ArrayRef<RVec>  h_v,
+                                                   AtomLocality         atomLocality)
+{
+    return impl_->copyVelocitiesFromGpu(h_v, atomLocality);
+}
+
+
+DeviceBuffer<float> StatePropagatorDataGpu::getForces()
+{
+    return impl_->getForces();
+}
+
+void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec>  h_f,
+                                             AtomLocality                          atomLocality)
+{
+    return impl_->copyForcesToGpu(h_f, atomLocality);
+}
+
+void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec>  h_f,
+                                               AtomLocality         atomLocality)
+{
+    return impl_->copyForcesFromGpu(h_f, atomLocality);
+}
+
+void StatePropagatorDataGpu::synchronizeStream()
+{
+    return impl_->synchronizeStream();
+}
+
+int StatePropagatorDataGpu::numAtomsLocal()
+{
+    return impl_->numAtomsLocal();
+}
+
+int StatePropagatorDataGpu::numAtomsAll()
+{
+    return impl_->numAtomsAll();
+}
+
+}      // namespace gmx
+
+#endif // GMX_GPU == GMX_GPU_NONE
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp

index f6df6eb6b48cfbdab76dce4779ae909d91861152..5df9633e2727d723861dc7c2f01eb76e91d8b043 100644 (file)
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -1084,42 +1084,12 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet     &gridSet,
      }
  }
  
-void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet     &gridSet,
-                                  const Nbnxm::AtomLocality locality,
-                                  bool                      fillLocal,
-                                  nbnxn_atomdata_t         *nbat,
-                                  gmx_nbnxn_gpu_t          *gpu_nbv,
-                                  const rvec               *coordinatesHost)
-{
-    int gridBegin = 0;
-    int gridEnd   = 0;
-    getAtomRanges(gridSet, locality, &gridBegin, &gridEnd);
-
-    if (fillLocal)
-    {
-        nbat->natoms_local = gridSet.grids()[0].atomIndexEnd();
-    }
-
-    for (int g = gridBegin; g < gridEnd; g++)
-    {
-        nbnxn_gpu_copy_x_to_gpu(gridSet.grids()[g],
-                                gpu_nbv,
-                                locality,
-                                coordinatesHost);
-    }
-}
-
-DeviceBuffer<float> nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv)
-{
-    return Nbnxm::nbnxn_gpu_get_x_gpu(gpu_nbv);
-}
-
  /* Copies (and reorders) the coordinates to nbnxn_atomdata_t on the GPU*/
  void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
                                      const Nbnxm::AtomLocality locality,
                                      bool                      fillLocal,
                                      gmx_nbnxn_gpu_t          *gpu_nbv,
-                                    DeviceBuffer<float>       coordinatesDevice)
+                                    DeviceBuffer<float>       d_x)
  {
  
      int gridBegin = 0;
@@ -1131,7 +1101,7 @@ void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
          nbnxn_gpu_x_to_nbat_x(gridSet.grids()[g],
                                fillLocal && g == 0,
                                gpu_nbv,
-                              coordinatesDevice,
+                              d_x,
                                locality,
                                g,
                                gridSet.numColumnsMax());
@@ -1552,11 +1522,6 @@ void reduceForcesGpu(const Nbnxm::AtomLocality        locality,
                                       accumulateForce);
  }
  
-DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv)
-{
-    return Nbnxm::nbnxn_gpu_get_f_gpu(gpu_nbv);
-}
-
  void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
                                                gmx::ArrayRef<gmx::RVec>  fshift)
  {
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h

index ba644bd436e093cd022a611e044dc486fddf0f08..412d328bcea5be22cea2621dcf84599294c451e2 100644 (file)
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -58,8 +58,6 @@ struct nonbonded_verlet_t;
  struct t_mdatoms;
  struct tMPI_Atomic;
  
-enum class BufferOpsUseGpu;
-
  class GpuEventSynchronizer;
  
  namespace Nbnxm
@@ -326,46 +324,22 @@ void nbnxn_atomdata_copy_x_to_nbat_x(const Nbnxm::GridSet       &gridSet,
                                       const rvec                 *coordinates,
                                       nbnxn_atomdata_t           *nbat);
  
-/*! \brief Copies the coordinates to the GPU (in plain rvec format)
- *
- *  This function copied data to the gpu so that the transformation to the NBNXM format can be done on the GPU.
- *
- * \param[in] gridSet          The grids data.
- * \param[in] locality         If local or non local coordinates should be copied.
- * \param[in] fillLocal        If the local filler particle coordinates should be zeroed.
- * \param[in] nbat             Data in NBNXM format, used to zero coordinates of filler particles.
- * \param[in] gpu_nbv          The NBNXM GPU data structure.
- * \param[in] coordinatesHost  Coordinates to be copied (in plain rvec format).
- */
-void nbnxn_atomdata_copy_x_to_gpu(const Nbnxm::GridSet     &gridSet,
-                                  Nbnxm::AtomLocality       locality,
-                                  bool                      fillLocal,
-                                  nbnxn_atomdata_t         *nbat,
-                                  gmx_nbnxn_gpu_t          *gpu_nbv,
-                                  const rvec               *coordinatesHost);
-
-/*!\brief Getter for the GPU coordinates buffer
- *
- * \param[in] gpu_nbv  The NBNXM GPU data structure.
- */
-DeviceBuffer<float> nbnxn_atomdata_get_x_gpu(gmx_nbnxn_gpu_t *gpu_nbv);
-
  /*! \brief Transform coordinates to xbat layout on GPU
   *
   * Creates a GPU copy of the coordinates buffer using short-range ordering.
   * As input, uses coordinates in plain rvec format in GPU memory.
   *
- * \param[in]     gridSet            The grids data.
- * \param[in]     locality           If the transformation should be applied to local or non local coordinates.
- * \param[in]     fillLocal          Tells if the local filler particle coordinates should be zeroed.
- * \param[in,out] gpu_nbv            The NBNXM GPU data structure.
- * \param[in]     coordinatesDevice  Coordinates to be copied (in plain rvec format).
+ * \param[in]     gridSet    The grids data.
+ * \param[in]     locality   If the transformation should be applied to local or non local coordinates.
+ * \param[in]     fillLocal  Tells if the local filler particle coordinates should be zeroed.
+ * \param[in,out] gpu_nbv    The NBNXM GPU data structure.
+ * \param[in]     d_x        Coordinates to be copied (in plain rvec format).
   */
  void nbnxn_atomdata_x_to_nbat_x_gpu(const Nbnxm::GridSet     &gridSet,
                                      Nbnxm::AtomLocality       locality,
                                      bool                      fillLocal,
                                      gmx_nbnxn_gpu_t          *gpu_nbv,
-                                    DeviceBuffer<float>       coordinatesDevice);
+                                    DeviceBuffer<float>       d_x);
  
  /*! \brief Add the computed forces to \p f, an internal reduction might be performed as well
   *
@@ -399,16 +373,6 @@ void reduceForcesGpu(Nbnxm::AtomLocality                 locality,
                       bool                                useGpuFPmeReduction,
                       bool                                accumulateForce);
  
-/*!\brief Getter for the GPU forces buffer
- *
- * \todo Will be removed when the buffer management is lifted out of the NBNXM
- *
- * \param[in] gpu_nbv  The NBNXM GPU data structure.
- *
- * \returns Device forces buffer
- */
-DeviceBuffer<float> nbnxn_atomdata_get_f_gpu(gmx_nbnxn_gpu_t *gpu_nbv);
-
  /* Add the fshift force stored in nbat to fshift */
  void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t   &nbat,
                                                gmx::ArrayRef<gmx::RVec>  fshift);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 10a6f0a0d37b5f1c248869b89188ddf242c0e406..dba1ca8c826f454164b64822733a2fe53bf961b5 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -743,60 +743,11 @@ void cuda_set_cacheconfig()
      }
  }
  
-/* X buffer operations on GPU: copies coordinates to the GPU in rvec format. */
-void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid               &grid,
-                             gmx_nbnxn_gpu_t                 *nb,
-                             const Nbnxm::AtomLocality        locality,
-                             const rvec                      *coordinatesHost)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    bool                       bDoTime = nb->bDoTime;
-
-    Nbnxm::InteractionLocality interactionLoc            = gpuAtomToInteractionLocality(locality);
-    int                        numCopyAtoms              = grid.srcAtomEnd() - grid.srcAtomBegin();
-    int                        copyAtomStart             = grid.srcAtomBegin();
-
-    cudaStream_t               stream  = nb->stream[interactionLoc];
-
-    // empty domain avoid launching zero-byte copy
-    if (numCopyAtoms == 0)
-    {
-        return;
-    }
-    GMX_ASSERT(coordinatesHost,  "Need a valid host pointer");
-
-    if (bDoTime)
-    {
-        nb->timers->xf[locality].nb_h2d.openTimingRegion(stream);
-    }
-
-    rvec       *devicePtrDest = reinterpret_cast<rvec *> (nb->xrvec[copyAtomStart]);
-    const rvec *devicePtrSrc  = reinterpret_cast<const rvec *> (coordinatesHost[copyAtomStart]);
-    copyToDeviceBuffer(&devicePtrDest, devicePtrSrc, 0, numCopyAtoms,
-                       stream, GpuApiCallBehavior::Async, nullptr);
-
-    if (interactionLoc == Nbnxm::InteractionLocality::Local)
-    {
-        nb->xAvailableOnDevice->markEvent(stream);
-    }
-
-    if (bDoTime)
-    {
-        nb->timers->xf[locality].nb_h2d.closeTimingRegion(stream);
-    }
-}
-
-DeviceBuffer<float> nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t *nb)
-{
-    return reinterpret_cast< DeviceBuffer<float> >(nb->xrvec);
-}
-
  /* X buffer operations on GPU: performs conversion from rvec to nb format. */
  void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
                             bool                             setFillerCoords,
                             gmx_nbnxn_gpu_t                 *nb,
-                           DeviceBuffer<float>              coordinatesDevice,
+                           DeviceBuffer<float>              d_x,
                             const Nbnxm::AtomLocality        locality,
                             int                              gridId,
                             int                              numColumnsMax)
@@ -817,7 +768,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
      if (numAtoms != 0)
      {
          // TODO: This will only work with CUDA
-        GMX_ASSERT(coordinatesDevice, "Need a valid device pointer");
+        GMX_ASSERT(d_x, "Need a valid device pointer");
  
          KernelLaunchConfig config;
          config.blockSize[0]     = c_bufOpsThreadsPerBlock;
@@ -839,7 +790,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid               &grid,
                                                                &numColumns,
                                                                &xqPtr,
                                                                &setFillerCoords,
-                                                              &coordinatesDevice,
+                                                              &d_x,
                                                                &d_atomIndices,
                                                                &d_cxy_na,
                                                                &d_cxy_ind,
@@ -920,142 +871,6 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality               atomLocality,
  
  }
  
-DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t *nb)
-{
-    return reinterpret_cast< DeviceBuffer<float> >(nb->frvec);
-}
-
-void nbnxn_launch_copy_f_to_gpu(const AtomLocality               atomLocality,
-                                const Nbnxm::GridSet            &gridSet,
-                                gmx_nbnxn_gpu_t                 *nb,
-                                rvec                            *f)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-
-    bool                      bDoTime = nb->bDoTime;
-    cu_timers_t              *t       = nb->timers;
-
-    int                       atomStart = 0, numCopyAtoms = 0;
-
-    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms);
-
-    // Avoiding launching copy with no work
-    if (numCopyAtoms == 0)
-    {
-        return;
-    }
-    GMX_ASSERT(f, "Need a valid f pointer");
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
-    }
-
-    rvec       *ptrDest  = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
-    rvec       *ptrSrc   = reinterpret_cast<rvec *> (f[atomStart]);
-    //copyToDeviceBuffer(&ptrDest, ptrSrc, 0, numCopyAtoms,
-    //                   stream, GpuApiCallBehavior::Async, nullptr);
-    //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
-    cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyHostToDevice,
-                    stream);
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
-    }
-
-    return;
-}
-
-void nbnxn_launch_copy_f_from_gpu(const AtomLocality               atomLocality,
-                                  const Nbnxm::GridSet            &gridSet,
-                                  gmx_nbnxn_gpu_t                 *nb,
-                                  rvec                            *f)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-
-    bool                      bDoTime = nb->bDoTime;
-    cu_timers_t              *t       = nb->timers;
-    int                       atomStart, numCopyAtoms;
-
-    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &numCopyAtoms);
-
-    // Avoiding launching copy with no work
-    if (numCopyAtoms == 0)
-    {
-        return;
-    }
-    GMX_ASSERT(f, "Need a valid f pointer");
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
-    }
-
-    GMX_ASSERT(nb->frvec,  "Need a valid nb->frvec pointer");
-    rvec       *ptrDest = reinterpret_cast<rvec *> (f[atomStart]);
-    rvec       *ptrSrc  = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
-    //copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, numCopyAtoms,
-    //                   stream, GpuApiCallBehavior::Async, nullptr);
-    //TODO use above API call rather than direct memcpy when force has been implemented in a hostvector
-    cudaMemcpyAsync(ptrDest, ptrSrc, numCopyAtoms*sizeof(rvec), cudaMemcpyDeviceToHost,
-                    stream);
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
-    }
-
-    return;
-}
-
-void nbnxn_launch_copy_x_from_gpu(const AtomLocality               atomLocality,
-                                  const Nbnxm::GridSet            &gridSet,
-                                  gmx_nbnxn_gpu_t                 *nb,
-                                  rvec                            *x)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-    GMX_ASSERT(x,  "Need a valid x pointer");
-
-    const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
-    cudaStream_t              stream    = nb->stream[iLocality];
-
-    bool                      bDoTime = nb->bDoTime;
-    cu_timers_t              *t       = nb->timers;
-    int                       atomStart, nAtoms;
-
-    nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
-    }
-
-    GMX_ASSERT(nb->xrvec,  "Need a valid nb->xrvec pointer");
-    rvec       *ptrDest = reinterpret_cast<rvec *> (x[atomStart]);
-    rvec       *ptrSrc  = reinterpret_cast<rvec *> (nb->xrvec[atomStart]);
-    copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
-                         stream, GpuApiCallBehavior::Async, stream);
-
-    if (atomLocality == AtomLocality::NonLocal)
-    {
-        nb->xNonLocalCopyD2HDone->markEvent(stream);
-    }
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_d2h.closeTimingRegion(stream);
-    }
-
-    return;
-}
-
  void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomLocality,
                                          gmx_nbnxn_gpu_t                   *nb)
  {
@@ -1069,16 +884,6 @@ void nbnxn_wait_for_gpu_force_reduction(const AtomLocality      gmx_unused atomL
  
  }
  
-void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t *gpu_nbv)
-{
-    return static_cast<void *> (gpu_nbv->xrvec);
-}
-
-void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t *gpu_nbv)
-{
-    return static_cast<void *> (gpu_nbv->frvec);
-}
-
  void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t   *nb)
  {
      return static_cast<void*> (nb->xAvailableOnDevice);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu

index 1df63f01db93d40e2a8a87bce1c542fb3c8b2d93..3641d5eb5f38d120b22c5eae034b8b61ed784208 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -502,16 +502,12 @@ gpu_init(const gmx_device_info_t   *deviceInfo,
  
      cuda_init_const(nb, ic, listParams, nbat->params());
  
-    nb->natoms                   = 0;
-    nb->natoms_alloc             = 0;
      nb->atomIndicesSize          = 0;
      nb->atomIndicesSize_alloc    = 0;
      nb->ncxy_na                  = 0;
      nb->ncxy_na_alloc            = 0;
      nb->ncxy_ind                 = 0;
      nb->ncxy_ind_alloc           = 0;
-    nb->nfrvec                   = 0;
-    nb->nfrvec_alloc             = 0;
      nb->ncell                    = 0;
      nb->ncell_alloc              = 0;
  
@@ -903,10 +899,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet            &gridSet,
          const int           atomIndicesSize   = gridSet.atomIndices().size();
          const int          *cxy_na            = grid.cxy_na().data();
          const int          *cxy_ind           = grid.cxy_ind().data();
-        // TODO Should be done once per gridset
-        const int           numRealAtomsTotal = gridSet.numRealAtomsTotal();
  
-        reallocateDeviceBuffer(&gpu_nbv->xrvec, numRealAtomsTotal, &gpu_nbv->natoms, &gpu_nbv->natoms_alloc, nullptr);
          reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize, &gpu_nbv->atomIndicesSize_alloc, nullptr);
  
          if (atomIndicesSize > 0)
@@ -977,8 +970,6 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int                *cell,
  
      cudaStream_t         stream  = gpu_nbv->stream[InteractionLocality::Local];
  
-    reallocateDeviceBuffer(&gpu_nbv->frvec, natoms_total, &gpu_nbv->nfrvec, &gpu_nbv->nfrvec_alloc, nullptr);
-
      if (natoms_total > 0)
      {
          reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h

index ec4b2b8d339592d85b5bd7e802dca928f001d1bb..f3fc0e88527c6626c0a4021e9b03898f47da8d61 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -219,18 +219,6 @@ struct gmx_nbnxn_cuda_t
      bool                                                            bUseTwoStreams;
      //! atom data
      cu_atomdata_t                                                  *atdat;
-    //! coordinates in rvec format
-    rvec                                                           *xrvec;
-    //! number of atoms
-    int                                                             natoms;
-    //! number of atoms allocated in device buffer
-    int                                                             natoms_alloc;
-    //! force in rvec format
-    rvec                                                           *frvec;
-    //! number of atoms in force buffer
-    int                                                             nfrvec;
-    //! number of atoms allocated in force buffer
-    int                                                             nfrvec_alloc;
      //! f buf ops cell index mapping
      int                                                            *cell;
      //! number of indices in cell buffer
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp

index 892098ae4ab10ac0b253f46ed7364ee9f3773780..5c6967bb65f909dad4dd5f2be85bfea907f36a96 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -147,37 +147,16 @@ void nonbonded_verlet_t::convertCoordinates(const Nbnxm::AtomLocality       loca
      wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
  }
  
-
-void nonbonded_verlet_t::copyCoordinatesToGpu(const Nbnxm::AtomLocality       locality,
-                                              const bool                      fillLocal,
-                                              gmx::ArrayRef<const gmx::RVec>  coordinatesHost)
-{
-    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
-
-    nbnxn_atomdata_copy_x_to_gpu(pairSearch_->gridSet(), locality, fillLocal,
-                                 nbat.get(), gpu_nbv,
-                                 as_rvec_array(coordinatesHost.data()));
-
-    wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
-}
-
-DeviceBuffer<float> nonbonded_verlet_t::getDeviceCoordinates()
-{
-    return nbnxn_atomdata_get_x_gpu(gpu_nbv);
-}
-
  void nonbonded_verlet_t::convertCoordinatesGpu(const Nbnxm::AtomLocality       locality,
                                                 const bool                      fillLocal,
-                                               DeviceBuffer<float>             coordinatesDevice)
+                                               DeviceBuffer<float>             d_x)
  {
      wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
      wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
  
      nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, fillLocal,
                                     gpu_nbv,
-                                   coordinatesDevice);
+                                   d_x);
  
      wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
      wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
@@ -254,11 +233,6 @@ nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
      wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
  }
  
-DeviceBuffer<float> nonbonded_verlet_t::getDeviceForces()
-{
-    return nbnxn_atomdata_get_f_gpu(gpu_nbv);
-}
-
  real nonbonded_verlet_t::pairlistInnerRadius() const
  {
      return pairlistSets_->params().rlistInner;
@@ -286,40 +260,11 @@ void nonbonded_verlet_t::insertNonlocalGpuDependency(const Nbnxm::InteractionLoc
      Nbnxm::nbnxnInsertNonlocalGpuDependency(gpu_nbv, interactionLocality);
  }
  
-void nonbonded_verlet_t::launch_copy_f_to_gpu(rvec *f, const Nbnxm::AtomLocality locality)
-{
-    nbnxn_launch_copy_f_to_gpu(locality,
-                               pairSearch_->gridSet(),
-                               gpu_nbv,
-                               f);
-}
-
-void nonbonded_verlet_t::launch_copy_f_from_gpu(rvec *f, const Nbnxm::AtomLocality locality)
-{
-    nbnxn_launch_copy_f_from_gpu(locality,
-                                 pairSearch_->gridSet(),
-                                 gpu_nbv,
-                                 f);
-}
-
-void nonbonded_verlet_t::launch_copy_x_from_gpu(rvec *x, const Nbnxm::AtomLocality locality)
-{
-    nbnxn_launch_copy_x_from_gpu(locality,
-                                 pairSearch_->gridSet(),
-                                 gpu_nbv,
-                                 x);
-}
-
  void nonbonded_verlet_t::wait_for_gpu_force_reduction(const Nbnxm::AtomLocality locality)
  {
      nbnxn_wait_for_gpu_force_reduction(locality, gpu_nbv);
  }
  
-void* nonbonded_verlet_t::get_gpu_xrvec()
-{
-    return Nbnxm::nbnxn_get_gpu_xrvec(gpu_nbv);
-}
-
  void* nonbonded_verlet_t::get_x_on_device_event()
  {
      return Nbnxm::nbnxn_get_x_on_device_event(gpu_nbv);
@@ -330,11 +275,6 @@ void nonbonded_verlet_t::wait_nonlocal_x_copy_D2H_done()
      Nbnxm::nbnxn_wait_nonlocal_x_copy_D2H_done(gpu_nbv);
  }
  
-void* nonbonded_verlet_t::get_gpu_frvec()
-{
-    return Nbnxm::nbnxn_get_gpu_frvec(gpu_nbv);
-}
-
  void nonbonded_verlet_t::stream_local_wait_for_nonlocal()
  {
      Nbnxm::nbnxn_stream_local_wait_for_nonlocal(gpu_nbv);
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h

index c6d12d2e87231a6d4f14c7b19e676b22ce371a5f..991f21692e9f0ad6665bd633d18d6030ee2688a5 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -264,42 +264,17 @@ struct nonbonded_verlet_t
                                  bool                            fillLocal,
                                  gmx::ArrayRef<const gmx::RVec>  coordinates);
  
-        /*!\brief Copy coordinates to the GPU memory.
-         *
-         * This function uses the internal NBNXM GPU pointer to copy coordinates in the plain rvec format
-         * into the GPU memory.
-         *
-         * \todo This function will be removed in future patches as the management of the device buffers
-         *       is moved to a separate object.
-         *
-         * \param[in] locality         Whether coordinates for local or non-local atoms should be transformed.
-         * \param[in] fillLocal        If the coordinates for filler particles should be zeroed.
-         * \param[in] coordinatesHost  Coordinates in plain rvec format to be transformed.
-         */
-        void copyCoordinatesToGpu(Nbnxm::AtomLocality             locality,
-                                  bool                            fillLocal,
-                                  gmx::ArrayRef<const gmx::RVec>  coordinatesHost);
-
-        /*!\brief Getter for the GPU coordinates buffer.
-         *
-         * \todo This function will be removed in future patches as the management of the device buffers
-         *       is moved to a separate object.
-         *
-         * \returns The coordinates buffer in plain rvec format.
-         */
-        DeviceBuffer<float> getDeviceCoordinates();
-
          /*!\brief Convert the coordinates to NBNXM format on the GPU for the given locality
           *
           * The API function for the transformation of the coordinates from one layout to another in the GPU memory.
           *
-         * \param[in] locality           Whether coordinates for local or non-local atoms should be transformed.
-         * \param[in] fillLocal          If the coordinates for filler particles should be zeroed.
-         * \param[in] coordinatesDevice  GPU coordinates buffer in plain rvec format to be transformed.
+         * \param[in] locality   Whether coordinates for local or non-local atoms should be transformed.
+         * \param[in] fillLocal  If the coordinates for filler particles should be zeroed.
+         * \param[in] d_x        GPU coordinates buffer in plain rvec format to be transformed.
           */
          void convertCoordinatesGpu(Nbnxm::AtomLocality             locality,
                                     bool                            fillLocal,
-                                   DeviceBuffer<float>             coordinatesDevice);
+                                   DeviceBuffer<float>             d_x);
  
          //! Init for GPU version of setup coordinates in Nbnxm
          void atomdata_init_copy_x_to_nbat_x_gpu();
@@ -370,36 +345,15 @@ struct nonbonded_verlet_t
                                            bool                                useGpuFPmeReduction,
                                            bool                                accumulateForce);
  
-        /*!\brief Getter for the GPU force buffer.
-         *
-         * \todo This function will be removed in future patches as the management of the device buffers
-         *       is moved to a separate object.
-         *
-         * \returns The force buffer in plain rvec format.
-         */
-        DeviceBuffer<float> getDeviceForces();
-
          /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
          void atomdata_init_add_nbat_f_to_f_gpu();
  
-        /*! \brief H2D transfer of force buffer*/
-        void launch_copy_f_to_gpu(rvec *f, Nbnxm::AtomLocality locality);
-
-        /*! \brief D2H transfer of force buffer*/
-        void launch_copy_f_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
-
-        /*! \brief D2H transfer of coordinate buffer*/
-        void launch_copy_x_from_gpu(rvec *f, Nbnxm::AtomLocality locality);
-
          /*! \brief Wait for GPU force reduction task and D2H transfer of its results to complete
           *
           * FIXME: need more details: when should be called / after which operation, etc.
           */
          void wait_for_gpu_force_reduction(Nbnxm::AtomLocality locality);
  
-        /*! \brief return GPU pointer to x in rvec format */
-        void* get_gpu_xrvec();
-
          /*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
          void* get_x_on_device_event();
  
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h

index f30028180270048a24667127a8387a7b2deaff1b..f661a19ca1a4ea4759c2b295435677f7d5fd20ef 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -229,48 +229,21 @@ CUDA_FUNC_QUALIFIER
  void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet gmx_unused &gridSet,
                                  gmx_nbnxn_gpu_t    gmx_unused *gpu_nbv) CUDA_FUNC_TERM;
  
-/*! \brief Copy coordinates from host to device memory.
- *
- * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
+/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
   *
- * \param[in]     grid             Grid to be copied.
+ * \param[in]     grid             Grid to be converted.
+ * \param[in]     setFillerCoords  If the filler coordinates are used.
   * \param[in,out] gpu_nbv          The nonbonded data GPU structure.
+ * \param[in]     d_x              Device-side coordinates in plain rvec format.
   * \param[in]     locality         Copy coordinates for local or non-local atoms.
- * \param[in]     coordinatesHost  Host-side coordinates in plain rvec format.
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_copy_x_to_gpu(const Nbnxm::Grid   gmx_unused &grid,
-                             gmx_nbnxn_gpu_t     gmx_unused *gpu_nbv,
-                             Nbnxm::AtomLocality gmx_unused  locality,
-                             const rvec          gmx_unused *coordinatesHost) CUDA_FUNC_TERM;
-
-/*! \brief Getter for the device coordinates buffer.
- *
- * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
- *
- * \param[in]  gpu_nbv  The nonbonded data GPU structure.
- *
- * \returns Device coordinates buffer in plain rvec format.
- */
-CUDA_FUNC_QUALIFIER
-DeviceBuffer<float> nbnxn_gpu_get_x_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
-
-
-/*! \brief X buffer operations on GPU: performs conversion from rvec to nb format.
- *
- * \param[in]     grid               Grid to be converted.
- * \param[in]     setFillerCoords    If the filler coordinates are used.
- * \param[in,out] gpu_nbv            The nonbonded data GPU structure.
- * \param[in]     coordinatesDevice  Device-side coordinates in plain rvec format.
- * \param[in]     locality           Copy coordinates for local or non-local atoms.
- * \param[in]     gridId             Index of the grid being converted.
- * \param[in]     numColumnsMax      Maximum number of columns in the grid.
+ * \param[in]     gridId           Index of the grid being converted.
+ * \param[in]     numColumnsMax    Maximum number of columns in the grid.
   */
  CUDA_FUNC_QUALIFIER
  void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid   gmx_unused &grid,
                             bool                gmx_unused  setFillerCoords,
                             gmx_nbnxn_gpu_t     gmx_unused *gpu_nbv,
-                           DeviceBuffer<float> gmx_unused  coordinatesDevice,
+                           DeviceBuffer<float> gmx_unused  d_x,
                             Nbnxm::AtomLocality gmx_unused  locality,
                             int                 gmx_unused  gridId,
                             int                 gmx_unused  numColumnsMax) CUDA_FUNC_TERM;
@@ -346,43 +319,6 @@ void nbnxn_gpu_add_nbat_f_to_f(AtomLocality                 gmx_unused  atomLoca
                                 bool                         gmx_unused  useGpuFPmeReduction,
                                 bool                         gmx_unused  accumulateForce) CUDA_FUNC_TERM;
  
-/*! \brief Getter for the device coordinates buffer.
- *
- * \todo This will be removed as the management of the buffers is taken out of the NBNXM module.
- *
- * \param[in]  gpu_nbv  The nonbonded data GPU structure.
- *
- * \returns Device coordinates buffer in plain rvec format.
- */
-CUDA_FUNC_QUALIFIER
-DeviceBuffer<float> nbnxn_gpu_get_f_gpu(gmx_nbnxn_gpu_t gmx_unused *gpu_nbv) CUDA_FUNC_TERM_WITH_RETURN(DeviceBuffer<float> {});
-
-/*! \brief Copy force buffer from CPU to GPU */
-CUDA_FUNC_QUALIFIER
-void nbnxn_launch_copy_f_to_gpu(AtomLocality            gmx_unused  atomLocality,
-                                const Nbnxm::GridSet    gmx_unused &gridSet,
-                                gmx_nbnxn_gpu_t         gmx_unused *nb,
-                                rvec                    gmx_unused *f) CUDA_FUNC_TERM;
-
-/*! \brief Copy force buffer from GPU to CPU */
-CUDA_FUNC_QUALIFIER
-void nbnxn_launch_copy_f_from_gpu(AtomLocality            gmx_unused  atomLocality,
-                                  const Nbnxm::GridSet    gmx_unused &gridSet,
-                                  gmx_nbnxn_gpu_t         gmx_unused *nb,
-                                  rvec                    gmx_unused *f) CUDA_FUNC_TERM;
-
-/*! \brief Asynchronous launch of copying coordinate buffer from GPU to CPU
- * \param[in]  atomLocality  Locality for data trasnfer
- * \param[in]  gridSet       The Grid Set data object
- * \param[in]  nb            The nonbonded data GPU structure
- * \param[out] x             Coordinate buffer on CPU
- */
-CUDA_FUNC_QUALIFIER
-void nbnxn_launch_copy_x_from_gpu(AtomLocality            gmx_unused  atomLocality,
-                                  const Nbnxm::GridSet    gmx_unused &gridSet,
-                                  gmx_nbnxn_gpu_t         gmx_unused *nb,
-                                  rvec                    gmx_unused *x) CUDA_FUNC_TERM;
-
  /*! \brief Wait for GPU stream to complete */
  CUDA_FUNC_QUALIFIER
  void nbnxn_wait_for_gpu_force_reduction(AtomLocality            gmx_unused  atomLocality,
@@ -400,24 +336,12 @@ void nbnxn_wait_x_on_device(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
  CUDA_FUNC_QUALIFIER
  void* nbnxn_get_x_on_device_event(const gmx_nbnxn_gpu_t gmx_unused    *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
  
-/*! \brief return GPU pointer to x in rvec format
- * \param[in] nb                   The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void* nbnxn_get_gpu_xrvec(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
-
  /*! \brief Wait for non-local copy of coordinate buffer from device to host
   * \param[in] nb                   The nonbonded data GPU structure
   */
  CUDA_FUNC_QUALIFIER
  void nbnxn_wait_nonlocal_x_copy_D2H_done(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM;
  
-/*! \brief return GPU pointer to f in rvec format
- * \param[in] nb                   The nonbonded data GPU structure
- */
-CUDA_FUNC_QUALIFIER
-void* nbnxn_get_gpu_frvec(gmx_nbnxn_gpu_t     gmx_unused *nb) CUDA_FUNC_TERM_WITH_RETURN(nullptr);
-
  /*! \brief Ensure local stream waits for non-local stream
   * \param[in] nb                   The nonbonded data GPU structure
   */
author	Artem Zhmurov <zhmurov@gmail.com>
	Tue, 3 Sep 2019 12:23:40 +0000 (14:23 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Tue, 1 Oct 2019 16:22:17 +0000 (18:22 +0200)
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/domdec/domdec.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cpp		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cu		patch \| blob \| history
src/gromacs/ewald/pme.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_internal.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_internal.h		patch \| blob \| history
src/gromacs/ewald/pme_only.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmegathertest.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmesplinespreadtest.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmetestcommon.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmetestcommon.h		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda.h		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.cpp		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.cu		patch \| blob \| history
src/gromacs/mdlib/update_constrain_cuda_impl.h		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdrun/runner.cpp		patch \| blob \| history
src/gromacs/mdtypes/CMakeLists.txt		patch \| blob \| history
src/gromacs/mdtypes/forcerec.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu.h	[new file with mode: 0644]	patch \| blob
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h	[new file with mode: 0644]	patch \| blob
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/nbnxm/atomdata.cpp		patch \| blob \| history
src/gromacs/nbnxm/atomdata.h		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.cpp		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h		patch \| blob \| history