Make use of the DeviceStreamManager

author Artem Zhmurov <zhmurov@gmail.com>

Mon, 24 Feb 2020 09:22:40 +0000 (10:22 +0100)

committer Artem Zhmurov <zhmurov@gmail.com>

Wed, 25 Mar 2020 06:47:15 +0000 (07:47 +0100)
author Artem Zhmurov <zhmurov@gmail.com>
Mon, 24 Feb 2020 09:22:40 +0000 (10:22 +0100)
committer Artem Zhmurov <zhmurov@gmail.com>
Wed, 25 Mar 2020 06:47:15 +0000 (07:47 +0100)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index ebcc92bf2ea1949d794860cc22f0db393771fefb..aed2af87cbe5522259c31d9311df501fa50cbae0 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -64,6 +64,7 @@
  #include "gromacs/domdec/partition.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/hardware/hw_info.h"
  #include "gromacs/listed_forces/manage_threading.h"
@@ -3200,13 +3201,16 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const g
      return bCutoffAllowed;
  }
  
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
-                              const t_commrec&     cr,
-                              const DeviceContext& deviceContext,
-                              const DeviceStream&  streamLocal,
-                              const DeviceStream&  streamNonLocal)
+void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
+                              const t_commrec&                cr,
+                              const gmx::DeviceStreamManager& deviceStreamManager)
  {
-
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be valid when using "
+                       "GPU halo exchange.");
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                       "Non-local non-bonded stream should be valid when using "
+                       "GPU halo exchange.");
      int gpuHaloExchangeSize = 0;
      int pulseStart          = 0;
      if (cr.dd->gpuHaloExchange.empty())
@@ -3228,7 +3232,9 @@ void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
          for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
          {
              cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
-                    cr.dd, cr.mpi_comm_mysim, deviceContext, streamLocal, streamNonLocal, pulse));
+                    cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(),
+                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
+                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse));
          }
      }
  }
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h

index da617607a64a1acf500d3c83338e4da823beced7..82cefa18c1eafcd85dab35aae4f4fda876e61dfc 100644 (file)
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -84,13 +84,13 @@ struct t_mdatoms;
  struct t_nrnb;
  struct gmx_wallcycle;
  enum class PbcType : int;
-class DeviceStream;
  class t_state;
  class DeviceContext;
  class GpuEventSynchronizer;
  
  namespace gmx
  {
+class DeviceStreamManager;
  class ForceWithShiftForces;
  class MDLogger;
  class RangePartitioning;
@@ -313,18 +313,15 @@ void dd_bonded_cg_distance(const gmx::MDLogger& mdlog,
                             real*                r_2b,
                             real*                r_mb);
  
-/*! \brief Construct the GPU halo exchange object(s)
- * \param[in] mdlog          The logger object
- * \param[in] cr             The commrec object
- * \param[in] deviceContext  GPU device context
- * \param[in] streamLocal    The local GPU stream
- * \param[in] streamNonLocal The non-local GPU stream
+/*! \brief Construct the GPU halo exchange object(s).
+ *
+ * \param[in] mdlog               The logger object.
+ * \param[in] cr                  The commrec object.
+ * \param[in] deviceStreamManager Manager of the GPU context and streams.
   */
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
-                              const t_commrec&     cr,
-                              const DeviceContext& deviceContext,
-                              const DeviceStream&  streamLocal,
-                              const DeviceStream&  streamNonLocal);
+void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
+                              const t_commrec&                cr,
+                              const gmx::DeviceStreamManager& deviceStreamManager);
  
  /*! \brief
   * (Re-) Initialization for GPU halo exchange
diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp

index 8d8bb673c22500f46c72600ed2ecfaa7ac5f887e..120887e8bbea3567ee059aea4ccb66fef4e966a2 100644 (file)
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -560,19 +560,20 @@ static int div_round_up(int enumerator, int denominator)
      return (enumerator + denominator - 1) / denominator;
  }
  
-gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
-                        const NumPmeDomains&     numPmeDomains,
-                        const t_inputrec*        ir,
-                        gmx_bool                 bFreeEnergy_q,
-                        gmx_bool                 bFreeEnergy_lj,
-                        gmx_bool                 bReproducible,
-                        real                     ewaldcoeff_q,
-                        real                     ewaldcoeff_lj,
-                        int                      nthread,
-                        PmeRunMode               runMode,
-                        PmeGpu*                  pmeGpu,
-                        const DeviceInformation* deviceInfo,
-                        const PmeGpuProgram*     pmeGpuProgram,
+gmx_pme_t* gmx_pme_init(const t_commrec*     cr,
+                        const NumPmeDomains& numPmeDomains,
+                        const t_inputrec*    ir,
+                        gmx_bool             bFreeEnergy_q,
+                        gmx_bool             bFreeEnergy_lj,
+                        gmx_bool             bReproducible,
+                        real                 ewaldcoeff_q,
+                        real                 ewaldcoeff_lj,
+                        int                  nthread,
+                        PmeRunMode           runMode,
+                        PmeGpu*              pmeGpu,
+                        const DeviceContext* deviceContext,
+                        const DeviceStream*  deviceStream,
+                        const PmeGpuProgram* pmeGpuProgram,
                          const gmx::MDLogger& /*mdlog*/)
  {
      int  use_threads, sum_use_threads, i;
@@ -883,8 +884,13 @@ gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
          {
              GMX_THROW(gmx::NotImplementedError(errorString));
          }
+        pme_gpu_reinit(pme.get(), deviceContext, deviceStream, pmeGpuProgram);
      }
-    pme_gpu_reinit(pme.get(), deviceInfo, pmeGpuProgram);
+    else
+    {
+        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object when PME is on a CPU.");
+    }
+
  
      pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
  
@@ -925,7 +931,7 @@ void gmx_pme_reinit(struct gmx_pme_t** pmedata,
          NumPmeDomains numPmeDomains = { pme_src->nnodes_major, pme_src->nnodes_minor };
          *pmedata = gmx_pme_init(cr, numPmeDomains, &irc, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE,
                                  ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread, pme_src->runMode,
-                                pme_src->gpu, nullptr, nullptr, dummyLogger);
+                                pme_src->gpu, nullptr, nullptr, nullptr, dummyLogger);
          /* When running PME on the CPU not using domain decomposition,
           * the atom data is allocated once only in gmx_pme_(re)init().
           */
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h

index bfc79b88e99bfa5ef037fdf64e519b6da385f539..59aa7c604d5eb8e26d801326dde106bc55eda910 100644 (file)
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -64,7 +64,6 @@ struct t_inputrec;
  struct t_nrnb;
  struct PmeGpu;
  struct gmx_wallclock_gpu_pme_t;
-struct DeviceInformation;
  struct gmx_enerdata_t;
  struct gmx_mtop_t;
  struct gmx_pme_t;
@@ -137,20 +136,21 @@ bool gmx_pme_check_restrictions(int  pme_order,
   * related things whose lifetime can/should exceed that of a task (or
   * perhaps task manager). See Redmine #2522.
   */
-gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
-                        const NumPmeDomains&     numPmeDomains,
-                        const t_inputrec*        ir,
-                        gmx_bool                 bFreeEnergy_q,
-                        gmx_bool                 bFreeEnergy_lj,
-                        gmx_bool                 bReproducible,
-                        real                     ewaldcoeff_q,
-                        real                     ewaldcoeff_lj,
-                        int                      nthread,
-                        PmeRunMode               runMode,
-                        PmeGpu*                  pmeGpu,
-                        const DeviceInformation* deviceInfo,
-                        const PmeGpuProgram*     pmeGpuProgram,
-                        const gmx::MDLogger&     mdlog);
+gmx_pme_t* gmx_pme_init(const t_commrec*     cr,
+                        const NumPmeDomains& numPmeDomains,
+                        const t_inputrec*    ir,
+                        gmx_bool             bFreeEnergy_q,
+                        gmx_bool             bFreeEnergy_lj,
+                        gmx_bool             bReproducible,
+                        real                 ewaldcoeff_q,
+                        real                 ewaldcoeff_lj,
+                        int                  nthread,
+                        PmeRunMode           runMode,
+                        PmeGpu*              pmeGpu,
+                        const DeviceContext* deviceContext,
+                        const DeviceStream*  deviceStream,
+                        const PmeGpuProgram* pmeGpuProgram,
+                        const gmx::MDLogger& mdlog);
  
  /*! \brief As gmx_pme_init, but takes most settings, except the grid/Ewald coefficients, from
   * pme_src. This is only called when the PME cut-off/grid size changes.
@@ -433,13 +433,6 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_AR
  GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
          GPU_FUNC_TERM_WITH_RETURN(nullptr);
  
-/*! \brief Returns the pointer to the GPU stream.
- *  \param[in] pme            The PME data structure.
- *  \returns                  Pointer to GPU stream object.
- */
-GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
  /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
   * \param[in] pme            The PME data structure.
   * \returns                  Pointer to sychronizer
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp

index 91596e77d483be31bca315568ae91da3671ea087..a5f54f004d1472fd15d7d2e5b5c3f29eb199c352 100644 (file)
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -433,15 +433,6 @@ void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<gmx::RVec> d_x)
      pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
  }
  
-const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* pme)
-{
-    if (!pme || !pme_gpu_active(pme))
-    {
-        return nullptr;
-    }
-    return pme_gpu_get_stream(pme->gpu);
-}
-
  GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* pme)
  {
      if (!pme || !pme_gpu_active(pme))
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp

index c7a6df563a9bf317c6a26ff4b06a9705501606e1..62f7cc5c7e26a1d7fa29fe99f5b164ca712b1d30 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -56,6 +56,8 @@
  #include <string>
  
  #include "gromacs/ewald/ewald_utils.h"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/math/invertmatrix.h"
  #include "gromacs/math/units.h"
@@ -462,16 +464,22 @@ void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
      pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
  }
  
-void pme_gpu_init_internal(PmeGpu* pmeGpu)
+/*! \brief Internal GPU initialization for PME.
+ *
+ * \param[in]  pmeGpu         GPU PME data.
+ * \param[in]  deviceContext  GPU context.
+ * \param[in]  deviceStream   GPU stream.
+ */
+static void pme_gpu_init_internal(PmeGpu* pmeGpu, const DeviceContext& deviceContext, const DeviceStream& deviceStream)
  {
  #if GMX_GPU == GMX_GPU_CUDA
      // Prepare to use the device that this PME task was assigned earlier.
      // Other entities, such as CUDA timing events, are known to implicitly use the device context.
-    CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
+    CU_RET_ERR(cudaSetDevice(deviceContext.deviceInfo().id), "Switching to PME CUDA device");
  #endif
  
      /* Allocate the target-specific structures */
-    pmeGpu->archSpecific.reset(new PmeGpuSpecific(pmeGpu->programHandle_->impl_->deviceContext_));
+    pmeGpu->archSpecific.reset(new PmeGpuSpecific(deviceContext, deviceStream));
      pmeGpu->kernelParams.reset(new PmeGpuKernelParams());
  
      pmeGpu->archSpecific->performOutOfPlaceFFT = true;
@@ -480,33 +488,12 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
       * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
       */
  
-    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
-    if (GMX_GPU == GMX_GPU_CUDA)
-    {
-        /* WARNING: CUDA timings are incorrect with multiple streams.
-         *          This is the main reason why they are disabled by default.
-         */
-        // TODO: Consider turning on by default when we can detect nr of streams.
-        pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
-    }
-    else if (GMX_GPU == GMX_GPU_OPENCL)
-    {
-        pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
-    }
-
  #if GMX_GPU == GMX_GPU_CUDA
-    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
+    pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
  #elif GMX_GPU == GMX_GPU_OPENCL
      pmeGpu->maxGridWidthX = INT32_MAX / 2;
      // TODO: is there no really global work size limit in OpenCL?
  #endif
-
-    /* Creating a PME GPU stream:
-     * - default high priority with CUDA
-     * - no priorities implemented yet with OpenCL; see #2532
-     */
-    pmeGpu->archSpecific->pmeStream_.init(pmeGpu->archSpecific->deviceContext_,
-                                          DeviceStreamPriority::High, pmeGpu->archSpecific->useTiming);
  }
  
  void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
@@ -721,13 +708,15 @@ static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeG
   * TODO: this should become PmeGpu::PmeGpu()
   *
   * \param[in,out] pme            The PME structure.
- * \param[in,out] deviceInfo     The GPU device information structure.
- * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
+ * \param[in]     deviceContext  The GPU context.
+ * \param[in]     deviceStream   The GPU stream.
+ * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
   */
-static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t*           pme,
+                         const DeviceContext& deviceContext,
+                         const DeviceStream&  deviceStream,
+                         const PmeGpuProgram* pmeGpuProgram)
  {
-    GMX_ASSERT(deviceInfo != nullptr,
-               "Device information can not be nullptr when GPU is used for PME.");
      pme->gpu       = new PmeGpu();
      PmeGpu* pmeGpu = pme->gpu;
      changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
@@ -743,13 +732,12 @@ static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, co
  
      pme_gpu_set_testing(pmeGpu, false);
  
-    pmeGpu->deviceInfo = deviceInfo;
      GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
      pmeGpu->programHandle_ = pmeGpuProgram;
  
      pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();
  
-    pme_gpu_init_internal(pmeGpu);
+    pme_gpu_init_internal(pmeGpu, deviceContext, deviceStream);
      pme_gpu_alloc_energy_virial(pmeGpu);
  
      pme_gpu_copy_common_data_from(pme);
@@ -773,19 +761,21 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx:
      }
  }
  
-void pme_gpu_reinit(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t*           pme,
+                    const DeviceContext* deviceContext,
+                    const DeviceStream*  deviceStream,
+                    const PmeGpuProgram* pmeGpuProgram)
  {
      GMX_ASSERT(pme != nullptr, "Need valid PME object");
-    if (pme->runMode == PmeRunMode::CPU)
-    {
-        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
-        return;
-    }
  
      if (!pme->gpu)
      {
+        GMX_RELEASE_ASSERT(deviceContext != nullptr,
+                           "Device context can not be nullptr when setting up PME on GPU.");
+        GMX_RELEASE_ASSERT(deviceStream != nullptr,
+                           "Device stream can not be nullptr when setting up PME on GPU.");
          /* First-time initialization */
-        pme_gpu_init(pme, deviceInfo, pmeGpuProgram);
+        pme_gpu_init(pme, *deviceContext, *deviceStream, pmeGpuProgram);
      }
      else
      {
@@ -1358,18 +1348,6 @@ void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx:
      pmeGpu->kernelParams->atoms.d_coordinates = d_x;
  }
  
-const DeviceStream* pme_gpu_get_stream(const PmeGpu* pmeGpu)
-{
-    if (pmeGpu)
-    {
-        return &pmeGpu->archSpecific->pmeStream_;
-    }
-    else
-    {
-        return nullptr;
-    }
-}
-
  GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
  {
      if (pmeGpu && pmeGpu->kernelParams)
diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h

index 3d764fd468a57448424199574a50a076d5aa60da..9a15c3bbc986f0b0141a84dc61db2ac1fd11f3cf 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_internal.h
+++ b/src/gromacs/ewald/pme_gpu_internal.h
@@ -54,8 +54,10 @@
  #include "pme_gpu_types_host.h"
  #include "pme_output.h"
  
-class GpuEventSynchronizer;
+class DeviceContext;
  struct DeviceInformation;
+class DeviceStream;
+class GpuEventSynchronizer;
  struct gmx_hw_info_t;
  struct gmx_gpu_opt_t;
  struct gmx_pme_t; // only used in pme_gpu_reinit
@@ -71,7 +73,7 @@ struct t_complex;
  namespace gmx
  {
  class MDLogger;
-}
+} // namespace gmx
  
  //! Type of spline data
  enum class PmeSplineDataType
@@ -299,14 +301,6 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu);
   */
  void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu);
  
-/*! \libinternal \brief
- * Does the one-time GPU-framework specific PME initialization.
- * For CUDA, the PME stream is created with the highest priority.
- *
- * \param[in] pmeGpu  The PME GPU structure.
- */
-void pme_gpu_init_internal(PmeGpu* pmeGpu);
-
  /*! \libinternal \brief
   * Initializes the CUDA FFT structures.
   *
@@ -387,13 +381,6 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FU
  GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
          GPU_FUNC_TERM_WITH_RETURN(nullptr);
  
-/*! \brief Return pointer to GPU stream.
- * \param[in] pmeGpu         The PME GPU structure.
- * \returns                  Pointer to stream object.
- */
-GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
  /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
   * \param[in] pmeGpu         The PME GPU structure.
   * \returns                  Pointer to sync object
@@ -498,13 +485,16 @@ GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu* GPU_FUNC_ARGUM
  /*! \libinternal \brief
   * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
   *
- * \param[in,out] pme             The PME structure.
- * \param[in]     deviceInfo      The GPU device information structure.
- * \param[in]     pmeGpuProgram   The PME GPU program data
+ * \param[in,out] pme            The PME structure.
+ * \param[in]     deviceContext  The GPU context.
+ * \param[in]     deviceStream   The GPU stream.
+ * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
+ *
   * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
   */
-GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t*               GPU_FUNC_ARGUMENT(pme),
-                                       const DeviceInformation* GPU_FUNC_ARGUMENT(deviceInfo),
+GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t*           GPU_FUNC_ARGUMENT(pme),
+                                       const DeviceContext* GPU_FUNC_ARGUMENT(deviceContext),
+                                       const DeviceStream*  GPU_FUNC_ARGUMENT(deviceStream),
                                         const PmeGpuProgram* GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM;
  
  /*! \libinternal \brief
diff --git a/src/gromacs/ewald/pme_gpu_types_host.h b/src/gromacs/ewald/pme_gpu_types_host.h

index 481c3a302d8151f53f3b613099ff5cfe8637bf52..9d7e2f78f9a842acd0f7b5c6bf5d8268ecf407fa 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_types_host.h
+++ b/src/gromacs/ewald/pme_gpu_types_host.h
@@ -162,9 +162,6 @@ struct PmeGpu
       */
      int nAtomsAlloc;
  
-    /*! \brief A pointer to the device used during the execution. */
-    const DeviceInformation* deviceInfo;
-
      /*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
       * Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
       * OpenCL seems to not have readily available global work size limit, so we just assign a large arbitrary constant to this instead.
diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h

index a019a7c0310b599ba6dda7c2e4fac04b65355f43..e134d2c0a79d306d4a6793c24c5ba6bfe2767b63 100644 (file)
--- a/src/gromacs/ewald/pme_gpu_types_host_impl.h
+++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h
@@ -71,9 +71,14 @@ struct PmeGpuSpecific
  {
      /*! \brief Constructor
       *
-     * \param[in] deviceContext GPU device context.
+     * \param[in] deviceContext  GPU device context.
+     * \param[in] pmeStream      GPU PME stream.
       */
-    PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
+    PmeGpuSpecific(const DeviceContext& deviceContext, const DeviceStream& pmeStream) :
+        deviceContext_(deviceContext),
+        pmeStream_(pmeStream)
+    {
+    }
  
      /*! \brief
       * A handle to the GPU context.
@@ -84,7 +89,7 @@ struct PmeGpuSpecific
      const DeviceContext& deviceContext_;
  
      /*! \brief The GPU stream where everything related to the PME happens. */
-    DeviceStream pmeStream_;
+    const DeviceStream& pmeStream_;
  
      /* Synchronization events */
      /*! \brief Triggered after the PME Force Calculations have been completed */
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp

index b85629b28af6ef4a630573a5b37466e4b1ac062f..26d824e3a2adfa66e18d36fb3e65b312b84ef92f 100644 (file)
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -82,6 +82,7 @@
  #include "gromacs/fileio/pdbio.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/hostallocator.h"
  #include "gromacs/math/gmxcomplex.h"
  #include "gromacs/math/units.h"
@@ -597,14 +598,14 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
  #endif
  }
  
-int gmx_pmeonly(struct gmx_pme_t*         pme,
-                const t_commrec*          cr,
-                t_nrnb*                   mynrnb,
-                gmx_wallcycle*            wcycle,
-                gmx_walltime_accounting_t walltime_accounting,
-                t_inputrec*               ir,
-                PmeRunMode                runMode,
-                const DeviceContext*      deviceContext)
+int gmx_pmeonly(struct gmx_pme_t*               pme,
+                const t_commrec*                cr,
+                t_nrnb*                         mynrnb,
+                gmx_wallcycle*                  wcycle,
+                gmx_walltime_accounting_t       walltime_accounting,
+                t_inputrec*                     ir,
+                PmeRunMode                      runMode,
+                const gmx::DeviceStreamManager* deviceStreamManager)
  {
      int     ret;
      int     natoms = 0;
@@ -629,25 +630,27 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
      const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
      if (useGpuForPme)
      {
-        const DeviceStream& deviceStream = *pme_gpu_get_device_stream(pme);
-
+        GMX_RELEASE_ASSERT(
+                deviceStreamManager != nullptr,
+                "Device stream manager can not be nullptr when using GPU in PME-only rank.");
+        GMX_RELEASE_ASSERT(deviceStreamManager->streamIsValid(gmx::DeviceStreamType::Pme),
+                           "GPU PME stream should be valid when using GPU in PME-only rank.");
          changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
          changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
          if (c_enableGpuPmePpComms)
          {
              pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
-                    deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme), pme_pp->mpi_comm_mysim,
+                    pme_pp->ppRanks);
              pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
-                    deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme), pme_pp->mpi_comm_mysim,
+                    pme_pp->ppRanks);
          }
-        GMX_RELEASE_ASSERT(
-                deviceContext != nullptr,
-                "Device context can not be nullptr when building GPU propagator data object.");
          // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
          //       This should be made safer.
-        stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(&deviceStream, *deviceContext,
-                                                                 GpuApiCallBehavior::Async,
-                                                                 pme_gpu_get_block_size(pme), wcycle);
+        stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
+                &deviceStreamManager->stream(gmx::DeviceStreamType::Pme), deviceStreamManager->context(),
+                GpuApiCallBehavior::Async, pme_gpu_get_block_size(pme), wcycle);
      }
  
      clear_nrnb(mynrnb);
diff --git a/src/gromacs/ewald/pme_only.h b/src/gromacs/ewald/pme_only.h

index 18edbb9b43ddf5bcdbe23127f58a2e79ece71027..1a71ea195c1b74ad44d168bd20753cfb0706e4d4 100644 (file)
--- a/src/gromacs/ewald/pme_only.h
+++ b/src/gromacs/ewald/pme_only.h
@@ -55,17 +55,20 @@ struct t_nrnb;
  struct gmx_pme_t;
  struct gmx_wallcycle;
  
-class DeviceContext;
  enum class PmeRunMode;
+namespace gmx
+{
+class DeviceStreamManager;
+}
  
  /*! \brief Called on the nodes that do PME exclusively */
-int gmx_pmeonly(gmx_pme_t*                pme,
-                const t_commrec*          cr,
-                t_nrnb*                   mynrnb,
-                gmx_wallcycle*            wcycle,
-                gmx_walltime_accounting_t walltime_accounting,
-                t_inputrec*               ir,
-                PmeRunMode                runMode,
-                const DeviceContext*      deviceContext);
+int gmx_pmeonly(gmx_pme_t*                      pme,
+                const t_commrec*                cr,
+                t_nrnb*                         mynrnb,
+                gmx_wallcycle*                  wcycle,
+                gmx_walltime_accounting_t       walltime_accounting,
+                t_inputrec*                     ir,
+                PmeRunMode                      runMode,
+                const gmx::DeviceStreamManager* deviceStreamManager);
  
  #endif
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu.h b/src/gromacs/ewald/pme_pp_comm_gpu.h

index ea750cc17c1052fb95a639dd2512da2d30b905e8..97accca871d2b27da7d8e28235871a3c9b8877e1 100644 (file)
--- a/src/gromacs/ewald/pme_pp_comm_gpu.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu.h
@@ -46,11 +46,14 @@
  #include "gromacs/utility/gmxmpi.h"
  
  class DeviceContext;
+class DeviceStream;
  class GpuEventSynchronizer;
  
  namespace gmx
  {
  
+class DeviceStreamManager;
+
  /*! \libinternal
  
   * \brief Manages communication related to GPU buffers between this
@@ -63,8 +66,9 @@ public:
       * \param[in] comm            Communicator used for simulation
       * \param[in] pmeRank         Rank of PME task
       * \param[in] deviceContext   GPU context.
+     * \param[in] deviceStream    GPU stream.
       */
-    PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
+    PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
      ~PmePpCommGpu();
  
      /*! \brief Perform steps required when buffer size changes
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp

index b8befc5311033680740c329c99d588b823226383..b53ce94ada0b13236bd22d2c2ed8180162665f36 100644 (file)
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
@@ -62,7 +62,10 @@ class PmePpCommGpu::Impl
  };
  
  /*!\brief Constructor stub. */
-PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */, int /* pmeRank */, const DeviceContext& /* deviceContext */) :
+PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */,
+                           int /* pmeRank */,
+                           const DeviceContext& /* deviceContext */,
+                           const DeviceStream& /* deviceStream */) :
      impl_(nullptr)
  {
      GMX_ASSERT(false,
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu

index 0e78978865bb4866fead00907c5d2272a5f0f80e..2c6f696ddd41a9d754c9ebab550db021f8d528c0 100644 (file)
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
@@ -49,6 +49,7 @@
  
  #include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
  #include "gromacs/gpu_utils/devicebuffer.h"
  #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
  #include "gromacs/utility/gmxmpi.h"
@@ -56,18 +57,18 @@
  namespace gmx
  {
  
-PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
+PmePpCommGpu::Impl::Impl(MPI_Comm             comm,
+                         int                  pmeRank,
+                         const DeviceContext& deviceContext,
+                         const DeviceStream&  deviceStream) :
      deviceContext_(deviceContext),
+    pmePpCommStream_(deviceStream),
      comm_(comm),
      pmeRank_(pmeRank)
  {
      GMX_RELEASE_ASSERT(
              GMX_THREAD_MPI,
              "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
-
-    // In CUDA we only need priority to create stream.
-    // (note that this will be moved from here in the follow-up patch)
-    pmePpCommStream_.init(deviceContext, DeviceStreamPriority::Normal, false);
  }
  
  PmePpCommGpu::Impl::~Impl() = default;
@@ -158,8 +159,11 @@ void* PmePpCommGpu::Impl::getForcesReadySynchronizer()
      return static_cast<void*>(&forcesReadySynchronizer_);
  }
  
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
-    impl_(new Impl(comm, pmeRank, deviceContext))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm             comm,
+                           int                  pmeRank,
+                           const DeviceContext& deviceContext,
+                           const DeviceStream&  deviceStream) :
+    impl_(new Impl(comm, pmeRank, deviceContext, deviceStream))
  {
  }
  
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h

index 934b7c40c62ed1a1daa3eb58f607cb494554d3dd..4c95d9bccdef21745a470389f0043376d8aaf428 100644 (file)
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.h
@@ -57,11 +57,13 @@ class PmePpCommGpu::Impl
  
  public:
      /*! \brief Creates PME-PP GPU communication object.
+     *
       * \param[in] comm            Communicator used for simulation
       * \param[in] pmeRank         Rank of PME task
       * \param[in] deviceContext   GPU context.
+     * \param[in] deviceStream    GPU stream.
       */
-    Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
+    Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
      ~Impl();
  
      /*! \brief Perform steps required when buffer size changes
@@ -116,10 +118,10 @@ public:
      void* getForcesReadySynchronizer();
  
  private:
-    //! Device context object
+    //! GPU context handle (not used in CUDA)
      const DeviceContext& deviceContext_;
-    //! CUDA stream used for the communication operations in this class
-    DeviceStream pmePpCommStream_;
+    //! Handle for CUDA stream used for the communication operations in this class
+    const DeviceStream& pmePpCommStream_;
      //! Remote location of PME coordinate data buffer
      void* remotePmeXBuffer_ = nullptr;
      //! Remote location of PME force data buffer
diff --git a/src/gromacs/ewald/tests/CMakeLists.txt b/src/gromacs/ewald/tests/CMakeLists.txt

index 6dea2b21ecca50c3a1ffc77a158c56729fa342d2..170bc96c120fee31877c8adb89504ee8660f9c7c 100644 (file)
--- a/src/gromacs/ewald/tests/CMakeLists.txt
+++ b/src/gromacs/ewald/tests/CMakeLists.txt
@@ -41,4 +41,13 @@ gmx_add_unit_test(EwaldUnitTests ewald-test HARDWARE_DETECTION
          testhardwarecontexts.cpp
      GPU_CPP_SOURCE_FILES
          pmetestcommon.cpp
-        )
+)
+
+gmx_add_libgromacs_sources(
+    testhardwarecontext.cpp
+)
+if (GMX_USE_CUDA)
+gmx_compile_cpp_as_cuda(
+    testhardwarecontext.cpp
+)
+endif()
diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp

index 51035f035576a8a031396a88d7a8506583ef0272..eae6448323fe323e9d1eef384c727752258ffbd1 100644 (file)
--- a/src/gromacs/ewald/tests/pmegathertest.cpp
+++ b/src/gromacs/ewald/tests/pmegathertest.cpp
@@ -280,13 +280,13 @@ public:
          TestReferenceData refData;
          for (const auto& context : getPmeTestEnv()->getHardwareContexts())
          {
-            CodePath   codePath = context->getCodePath();
+            CodePath   codePath = context->codePath();
              const bool supportedInput =
                      pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
              if (!supportedInput)
              {
                  /* Testing the failure for the unsupported input */
-                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, box),
+                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, nullptr, box),
                                   NotImplementedError);
                  continue;
              }
@@ -295,14 +295,16 @@ public:
              SCOPED_TRACE(
                      formatString("Testing force gathering with %s %sfor PME grid size %d %d %d"
                                   ", order %d, %zu atoms",
-                                 codePathToString(codePath), context->getDescription().c_str(),
+                                 codePathToString(codePath), context->description().c_str(),
                                   gridSize[XX], gridSize[YY], gridSize[ZZ], pmeOrder, atomCount));
  
-            PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
-                                                    context->getPmeGpuProgram(), box);
+            PmeSafePointer pmeSafe =
+                    pmeInitWrapper(&inputRec, codePath, context->deviceContext(),
+                                   context->deviceStream(), context->pmeGpuProgram(), box);
              std::unique_ptr<StatePropagatorDataGpu> stateGpu =
                      (codePath == CodePath::GPU)
-                            ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+                            ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext(),
+                                                         context->deviceStream())
                              : nullptr;
  
              pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, inputAtomData.coordinates,
diff --git a/src/gromacs/ewald/tests/pmesolvetest.cpp b/src/gromacs/ewald/tests/pmesolvetest.cpp

index 86688822ebcc8a1f70b980fab2d039e30d32730f..07f31629f6317c136379dd135a59135675946af7 100644 (file)
--- a/src/gromacs/ewald/tests/pmesolvetest.cpp
+++ b/src/gromacs/ewald/tests/pmesolvetest.cpp
@@ -109,13 +109,13 @@ public:
          TestReferenceData refData;
          for (const auto& context : getPmeTestEnv()->getHardwareContexts())
          {
-            CodePath   codePath = context->getCodePath();
+            CodePath   codePath = context->codePath();
              const bool supportedInput =
                      pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
              if (!supportedInput)
              {
                  /* Testing the failure for the unsupported input */
-                EXPECT_THROW_GMX(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, box,
+                EXPECT_THROW_GMX(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, nullptr, box,
                                                ewaldCoeff_q, ewaldCoeff_lj),
                                   NotImplementedError);
                  continue;
@@ -137,13 +137,13 @@ public:
                              "size %d %d %d, Ewald coefficients %g %g",
                              (method == PmeSolveAlgorithm::LennardJones) ? "Lennard-Jones" : "Coulomb",
                              gridOrdering.second.c_str(), computeEnergyAndVirial ? "with" : "without",
-                            codePathToString(codePath), context->getDescription().c_str(),
+                            codePathToString(codePath), context->description().c_str(),
                              gridSize[XX], gridSize[YY], gridSize[ZZ], ewaldCoeff_q, ewaldCoeff_lj));
  
                      /* Running the test */
-                    PmeSafePointer pmeSafe =
-                            pmeInitEmpty(&inputRec, codePath, context->getDeviceInfo(),
-                                         context->getPmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
+                    PmeSafePointer pmeSafe = pmeInitEmpty(
+                            &inputRec, codePath, context->deviceContext(), context->deviceStream(),
+                            context->pmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
                      pmeSetComplexGrid(pmeSafe.get(), codePath, gridOrdering.first, nonZeroGridValues);
                      const real cellVolume = box[0] * box[4] * box[8];
                      // FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp

index 5c2d8663ef593f5b52c2973cc45fb8bb058389a2..ef975b2d084389d1a7fe7240bacb55227440710d 100644 (file)
--- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
+++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
@@ -126,13 +126,13 @@ public:
  
          for (const auto& context : getPmeTestEnv()->getHardwareContexts())
          {
-            CodePath   codePath = context->getCodePath();
+            CodePath   codePath = context->codePath();
              const bool supportedInput =
                      pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
              if (!supportedInput)
              {
                  /* Testing the failure for the unsupported input */
-                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, box),
+                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, nullptr, box),
                                   NotImplementedError);
                  continue;
              }
@@ -141,20 +141,21 @@ public:
              {
                  /* Describing the test uniquely in case it fails */
  
-                SCOPED_TRACE(
-                        formatString("Testing %s with %s %sfor PME grid size %d %d %d"
-                                     ", order %d, %zu atoms",
-                                     option.second.c_str(), codePathToString(codePath),
-                                     context->getDescription().c_str(), gridSize[XX], gridSize[YY],
-                                     gridSize[ZZ], pmeOrder, atomCount));
+                SCOPED_TRACE(formatString(
+                        "Testing %s with %s %sfor PME grid size %d %d %d"
+                        ", order %d, %zu atoms",
+                        option.second.c_str(), codePathToString(codePath), context->description().c_str(),
+                        gridSize[XX], gridSize[YY], gridSize[ZZ], pmeOrder, atomCount));
  
                  /* Running the test */
  
-                PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
-                                                        context->getPmeGpuProgram(), box);
+                PmeSafePointer pmeSafe =
+                        pmeInitWrapper(&inputRec, codePath, context->deviceContext(),
+                                       context->deviceStream(), context->pmeGpuProgram(), box);
                  std::unique_ptr<StatePropagatorDataGpu> stateGpu =
                          (codePath == CodePath::GPU)
-                                ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+                                ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext(),
+                                                             context->deviceStream())
                                  : nullptr;
  
                  pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, coordinates, charges);
diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp

index eaf697e1d5fd8d82ca7faaa39e8a32db4ab5ed67..888cac58734ead34abf88a1606470f1900d0c893 100644 (file)
--- a/src/gromacs/ewald/tests/pmetestcommon.cpp
+++ b/src/gromacs/ewald/tests/pmetestcommon.cpp
@@ -59,6 +59,7 @@
  #include "gromacs/ewald/pme_solve.h"
  #include "gromacs/ewald/pme_spread.h"
  #include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/math/invertmatrix.h"
  #include "gromacs/mdtypes/commrec.h"
@@ -106,21 +107,22 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder)
  }
  
  //! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
-                              const CodePath           mode,
-                              const DeviceInformation* deviceInfo,
-                              const PmeGpuProgram*     pmeGpuProgram,
-                              const Matrix3x3&         box,
-                              const real               ewaldCoeff_q,
-                              const real               ewaldCoeff_lj)
+PmeSafePointer pmeInitWrapper(const t_inputrec*    inputRec,
+                              const CodePath       mode,
+                              const DeviceContext* deviceContext,
+                              const DeviceStream*  deviceStream,
+                              const PmeGpuProgram* pmeGpuProgram,
+                              const Matrix3x3&     box,
+                              const real           ewaldCoeff_q,
+                              const real           ewaldCoeff_lj)
  {
      const MDLogger dummyLogger;
      const auto     runMode       = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::Mixed;
      t_commrec      dummyCommrec  = { 0 };
      NumPmeDomains  numPmeDomains = { 1, 1 };
-    gmx_pme_t*     pmeDataRaw =
-            gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true, ewaldCoeff_q,
-                         ewaldCoeff_lj, 1, runMode, nullptr, deviceInfo, pmeGpuProgram, dummyLogger);
+    gmx_pme_t* pmeDataRaw = gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true,
+                                         ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr,
+                                         deviceContext, deviceStream, pmeGpuProgram, dummyLogger);
      PmeSafePointer pme(pmeDataRaw); // taking ownership
  
      // TODO get rid of this with proper matrix type
@@ -151,33 +153,35 @@ PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
  }
  
  //! Simple PME initialization based on input, no atom data
-PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
-                            const CodePath           mode,
-                            const DeviceInformation* deviceInfo,
-                            const PmeGpuProgram*     pmeGpuProgram,
-                            const Matrix3x3&         box,
-                            const real               ewaldCoeff_q,
-                            const real               ewaldCoeff_lj)
-{
-    return pmeInitWrapper(inputRec, mode, deviceInfo, pmeGpuProgram, box, ewaldCoeff_q, ewaldCoeff_lj);
+PmeSafePointer pmeInitEmpty(const t_inputrec*    inputRec,
+                            const CodePath       mode,
+                            const DeviceContext* deviceContext,
+                            const DeviceStream*  deviceStream,
+                            const PmeGpuProgram* pmeGpuProgram,
+                            const Matrix3x3&     box,
+                            const real           ewaldCoeff_q,
+                            const real           ewaldCoeff_lj)
+{
+    return pmeInitWrapper(inputRec, mode, deviceContext, deviceStream, pmeGpuProgram, box,
+                          ewaldCoeff_q, ewaldCoeff_lj);
      // hiding the fact that PME actually needs to know the number of atoms in advance
  }
  
  PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
  {
      const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
-    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
+    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
  }
  
  //! Make a GPU state-propagator manager
  std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
-                                                                   const DeviceContext& deviceContext)
+                                                                   const DeviceContext* deviceContext,
+                                                                   const DeviceStream* deviceStream)
  {
      // TODO: Pin the host buffer and use async memory copies
      // TODO: Special constructor for PME-only rank / PME-tests is used here. There should be a mechanism to
      //       restrict one from using other constructor here.
-    return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
-                                                    GpuApiCallBehavior::Sync,
+    return std::make_unique<StatePropagatorDataGpu>(deviceStream, *deviceContext, GpuApiCallBehavior::Sync,
                                                      pme_gpu_get_block_size(&pme), nullptr);
  }
  
diff --git a/src/gromacs/ewald/tests/pmetestcommon.h b/src/gromacs/ewald/tests/pmetestcommon.h

index 98a2bbd4d2106092219be25eb8b8cffc6c9e22df..7f2e727c5f302ec2c9ac8185f71f050877627349 100644 (file)
--- a/src/gromacs/ewald/tests/pmetestcommon.h
+++ b/src/gromacs/ewald/tests/pmetestcommon.h
@@ -55,6 +55,8 @@
  
  namespace gmx
  {
+
+class DeviceStreamManager;
  namespace test
  {
  
@@ -118,26 +120,31 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder);
  // PME stages
  
  //! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
-                              CodePath                 mode,
-                              const DeviceInformation* deviceInfo,
-                              const PmeGpuProgram*     pmeGpuProgram,
-                              const Matrix3x3&         box,
-                              real                     ewaldCoeff_q  = 1.0F,
-                              real                     ewaldCoeff_lj = 1.0F);
+PmeSafePointer pmeInitWrapper(const t_inputrec*    inputRec,
+                              CodePath             mode,
+                              const DeviceContext* deviceContext,
+                              const DeviceStream*  deviceStream,
+                              const PmeGpuProgram* pmeGpuProgram,
+                              const Matrix3x3&     box,
+                              real                 ewaldCoeff_q  = 1.0F,
+                              real                 ewaldCoeff_lj = 1.0F);
  //! Simple PME initialization (no atom data)
-PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
-                            CodePath                 mode,
-                            const DeviceInformation* deviceInfo,
-                            const PmeGpuProgram*     pmeGpuProgram,
-                            const Matrix3x3&         box,
-                            real                     ewaldCoeff_q,
-                            real                     ewaldCoeff_lj);
+PmeSafePointer pmeInitEmpty(const t_inputrec*    inputRec,
+                            CodePath             mode,
+                            const DeviceContext* deviceContext,
+                            const DeviceStream*  deviceStream,
+                            const PmeGpuProgram* pmeGpuProgram,
+                            const Matrix3x3&     box,
+                            real                 ewaldCoeff_q,
+                            real                 ewaldCoeff_lj);
+
  //! Simple PME initialization based on inputrec only
  PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec);
+
  //! Make a GPU state-propagator manager
  std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
-                                                                   const DeviceContext& deviceContext);
+                                                                   const DeviceContext* deviceContext,
+                                                                   const DeviceStream* deviceStream);
  //! PME initialization with atom data and system box
  void pmeInitAtoms(gmx_pme_t*               pme,
                    StatePropagatorDataGpu*  stateGpu,
diff --git a/src/gromacs/ewald/tests/testhardwarecontext.cpp b/src/gromacs/ewald/tests/testhardwarecontext.cpp

new file mode 100644 (file)

index 0000000..6e2c455
--- /dev/null
+++ b/src/gromacs/ewald/tests/testhardwarecontext.cpp
@@ -0,0 +1,124 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Implements the TestHardwareContext structure, which holds the device context,
+ * device stream and PME GPU program used by the unit tests for one hardware code path.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "testhardwarecontext.h"
+
+#include <memory>
+
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/utility/basenetwork.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
+
+namespace gmx
+{
+namespace test
+{
+
+// Constructs a context for the CPU code path: no device context or stream is created.
+TestHardwareContext::TestHardwareContext(CodePath codePath, const char* description) :
+    codePath_(codePath),
+    description_(description)
+{
+    GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
+                       "A GPU code path should provide DeviceInformation to the "
+                       "TestHardwareContext constructor.");
+    // The device handles are explicitly left empty for the CPU code path.
+    deviceContext_ = nullptr;
+    deviceStream_  = nullptr;
+}
+
+// Constructs a context for the GPU code path: creates the device context for \p deviceInfo,
+// a normal-priority stream in that context and the PME GPU program.
+TestHardwareContext::TestHardwareContext(CodePath                 codePath,
+                                         const char*              description,
+                                         const DeviceInformation& deviceInfo) :
+    codePath_(codePath),
+    description_(description)
+{
+    GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
+                       "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
+                       "in CPU build.");
+    // Keep the new objects in smart pointers until everything has been built: the destructor
+    // does not run when a constructor throws, so raw members assigned early would leak.
+    auto newContext = std::make_unique<DeviceContext>(deviceInfo);
+    auto newStream = std::make_unique<DeviceStream>(*newContext, DeviceStreamPriority::Normal, false);
+    program_       = buildPmeGpuProgram(*newContext);
+    // Nothing below can throw; hand ownership to the raw members freed by the destructor.
+    deviceStream_  = newStream.release();
+    deviceContext_ = newContext.release();
+}
+
+TestHardwareContext::~TestHardwareContext()
+{
+    // Free the stream first, then the context it was created in.
+    delete deviceStream_;
+    delete deviceContext_;
+}
+
+// Returns the device information of the owned device context, or nullptr when this
+// context holds no device context (it was built with the CPU constructor).
+const DeviceInformation* TestHardwareContext::deviceInfo() const
+{
+    // deviceContext_ is null for CPU contexts; dereferencing it would be undefined behavior.
+    return (deviceContext_ != nullptr) ? &deviceContext_->deviceInfo() : nullptr;
+}
+
+const DeviceContext* TestHardwareContext::deviceContext() const
+{
+    return deviceContext_; // nullptr when built with the CPU constructor
+}
+//! Get the device stream (nullptr when built with the CPU constructor)
+const DeviceStream* TestHardwareContext::deviceStream() const
+{
+    return deviceStream_;
+}
+
+// Maps a code path onto its short printable name; any other value raises NotImplementedError.
+const char* codePathToString(CodePath codePath)
+{
+    if (codePath == CodePath::CPU)
+    {
+        return "CPU";
+    }
+    if (codePath == CodePath::GPU)
+    {
+        return "GPU";
+    }
+    GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
+}
+
+} // namespace test
+} // namespace gmx
diff --git a/src/gromacs/ewald/tests/testhardwarecontext.h b/src/gromacs/ewald/tests/testhardwarecontext.h

new file mode 100644 (file)

index 0000000..fa5ebd9
--- /dev/null
+++ b/src/gromacs/ewald/tests/testhardwarecontext.h
@@ -0,0 +1,112 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_EWALD_TEST_HARDWARE_CONTEXT_H
+#define GMX_EWALD_TEST_HARDWARE_CONTEXT_H
+
+/*! \internal \file
+ * \brief
+ * Declares the TestHardwareContext structure, which holds the device context,
+ * device stream and PME GPU program used by the unit tests for one hardware code path.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "gromacs/ewald/pme_gpu_program.h"
+#include "gromacs/utility/gmxassert.h"
+
+class DeviceContext;
+struct DeviceInformation;
+class DeviceStream;
+
+namespace gmx
+{
+namespace test
+{
+//! Hardware code path being tested
+enum class CodePath
+{
+    CPU, //!< Tested code runs on the CPU; TestHardwareContext holds no device objects
+    GPU  //!< Tested code runs on a GPU; TestHardwareContext needs a DeviceInformation
+};
+
+//! Return a string useful for human-readable messages describing a \c codePath.
+const char* codePathToString(CodePath codePath);
+
+/*! \internal \brief
+ * A structure to describe a hardware context that persists over the lifetime
+ * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
+ *
+ * The device context and stream are held through raw pointers that the destructor
+ * frees, so objects of this type must not be copied.
+ */
+struct TestHardwareContext
+{
+    //! Hardware path for the code being tested.
+    CodePath codePath_;
+    //! Readable description
+    std::string description_;
+    //! Device context (owned, nullptr for the CPU code path)
+    DeviceContext* deviceContext_ = nullptr;
+    //! Device stream (owned, nullptr for the CPU code path)
+    DeviceStream* deviceStream_ = nullptr;
+    //! Persistent compiled GPU kernels for PME.
+    PmeGpuProgramStorage program_;
+
+public:
+    //! Returns the code path for this context.
+    CodePath codePath() const { return codePath_; }
+    //! Returns a human-readable context description line
+    std::string description() const { return description_; }
+    //! Returns the device info pointer
+    const DeviceInformation* deviceInfo() const;
+    //! Get the device context
+    const DeviceContext* deviceContext() const;
+    //! Get the device stream
+    const DeviceStream* deviceStream() const;
+    //! Returns the persistent PME GPU kernels
+    const PmeGpuProgram* pmeGpuProgram() const { return program_.get(); }
+    //! Constructs the context for CPU builds
+    TestHardwareContext(CodePath codePath, const char* description);
+    //! Constructs the context for GPU builds
+    TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo);
+    //! Copying is disabled: a copy would free the owned device context and stream twice
+    TestHardwareContext(const TestHardwareContext&) = delete;
+    //! Copy assignment is disabled: it would free the owned device context and stream twice
+    TestHardwareContext& operator=(const TestHardwareContext&) = delete;
+    //! Destructor
+    ~TestHardwareContext();
+};
+
+} // namespace test
+} // namespace gmx
+#endif
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.cpp b/src/gromacs/ewald/tests/testhardwarecontexts.cpp

index 661f0fa4bb8c448d10432cdbf4dc17c204c8dc7d..96f36f9810899dc9a7ef0d492c9fe7587deb0bb9 100644 (file)
--- a/src/gromacs/ewald/tests/testhardwarecontexts.cpp
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.cpp
@@ -60,18 +60,6 @@ namespace gmx
  namespace test
  {
  
-TestHardwareContext::~TestHardwareContext() = default;
-
-const char* codePathToString(CodePath codePath)
-{
-    switch (codePath)
-    {
-        case CodePath::CPU: return "CPU";
-        case CodePath::GPU: return "GPU";
-        default: GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
-    }
-}
-
  /* Implements the "construct on first use" idiom to avoid any static
   * initialization order fiasco.
   *
@@ -120,8 +108,6 @@ void PmeTestEnvironment::SetUp()
      for (int gpuIndex : getCompatibleGpus(hardwareInfo_->gpu_info))
      {
          const DeviceInformation* deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
-        GMX_RELEASE_ASSERT(deviceInfo != nullptr,
-                           "Device information should be provided for the GPU builds.");
          init_gpu(deviceInfo);
  
          char stmp[200] = {};
@@ -132,5 +118,10 @@ void PmeTestEnvironment::SetUp()
      }
  }
  
+void PmeTestEnvironment::TearDown()
+{
+    hardwareContexts_.clear();
+}
+
  } // namespace test
  } // namespace gmx
diff --git a/src/gromacs/ewald/tests/testhardwarecontexts.h b/src/gromacs/ewald/tests/testhardwarecontexts.h

index 0af2343795683cd88580bb2955b71a5bb2eebd0b..6a1450fe79b810074ec4268222e540bcb5ea3cf3 100644 (file)
--- a/src/gromacs/ewald/tests/testhardwarecontexts.h
+++ b/src/gromacs/ewald/tests/testhardwarecontexts.h
@@ -49,83 +49,17 @@
  #include <gtest/gtest.h>
  
  #include "gromacs/ewald/pme_gpu_program.h"
-#include "gromacs/gpu_utils/device_context.h"
  #include "gromacs/hardware/gpu_hw_info.h"
  #include "gromacs/utility/gmxassert.h"
  
+#include "testhardwarecontext.h"
+
  struct gmx_hw_info_t;
  
  namespace gmx
  {
  namespace test
  {
-//! Hardware code path being tested
-enum class CodePath
-{
-    CPU,
-    GPU
-};
-
-//! Return a string useful for human-readable messages describing a \c codePath.
-const char* codePathToString(CodePath codePath);
-
-/*! \internal \brief
- * A structure to describe a hardware context  that persists over the lifetime
- * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
- */
-struct TestHardwareContext
-{
-    //! Hardware path for the code being tested.
-    CodePath codePath_;
-    //! Readable description
-    std::string description_;
-    //! Device information pointer
-    const DeviceInformation* deviceInfo_;
-    //! Local copy of the device context pointer
-    std::unique_ptr<DeviceContext> deviceContext_;
-    //! Persistent compiled GPU kernels for PME.
-    PmeGpuProgramStorage program_;
-
-public:
-    //! Retuns the code path for this context.
-    CodePath getCodePath() const { return codePath_; }
-    //! Returns a human-readable context description line
-    std::string getDescription() const { return description_; }
-    //! Getter for the DeviceContext
-    const DeviceContext& deviceContext() const
-    {
-        GMX_RELEASE_ASSERT(deviceContext_ != nullptr,
-                           "Trying to get device context before it was initialized or in builds "
-                           "without GPU support.");
-        return *deviceContext_;
-    }
-    //! Returns the device info pointer
-    const DeviceInformation* getDeviceInfo() const { return deviceInfo_; }
-    //! Returns the persistent PME GPU kernels
-    const PmeGpuProgram* getPmeGpuProgram() const { return program_.get(); }
-    //! Constructs the context for CPU builds
-    TestHardwareContext(CodePath codePath, const char* description) :
-        codePath_(codePath),
-        description_(description)
-    {
-        GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
-                           "A GPU code path should provide DeviceInformation to the "
-                           "TestHardwareContext constructor.");
-    }
-    //! Constructs the context for GPU builds
-    TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo) :
-        codePath_(codePath),
-        description_(description),
-        deviceInfo_(&deviceInfo)
-    {
-        GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
-                           "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
-                           "in CPU build.");
-        deviceContext_ = std::make_unique<DeviceContext>(deviceInfo);
-        program_       = buildPmeGpuProgram(*deviceContext_);
-    }
-    ~TestHardwareContext();
-};
  
  //! A container of handles to hardware contexts
  typedef std::vector<std::unique_ptr<TestHardwareContext>> TestHardwareContexts;
@@ -144,6 +78,8 @@ private:
  public:
      //! This is called by GTest framework once to query the hardware
      void SetUp() override;
+    //! This is called by GTest framework once to release the hardware
+    void TearDown() override;
      //! Get available hardware contexts.
      const TestHardwareContexts& getHardwareContexts() const { return hardwareContexts_; }
      //! Get available hardware information.
diff --git a/src/gromacs/gpu_utils/device_stream.cpp b/src/gromacs/gpu_utils/device_stream.cpp

index 7f05de6b321999f905dcf7a3afda9174dc7a3eec..bfbe049235fdd571bf844c689047a990770532f7 100644 (file)
--- a/src/gromacs/gpu_utils/device_stream.cpp
+++ b/src/gromacs/gpu_utils/device_stream.cpp
@@ -59,4 +59,4 @@ bool DeviceStream::isValid() const
      return false;
  }
  
-void DeviceStream::synchronize() const {}
+void DeviceStream::synchronize() const {} // No-op in this implementation.
diff --git a/src/gromacs/gpu_utils/device_stream_manager.cpp b/src/gromacs/gpu_utils/device_stream_manager.cpp

index 1c8228e5902413616790adc6df20cbe4b0f5abee..8c7457a3d3b0da8c8b44b5f4ddef87632a14f162 100644 (file)
--- a/src/gromacs/gpu_utils/device_stream_manager.cpp
+++ b/src/gromacs/gpu_utils/device_stream_manager.cpp
@@ -156,6 +156,24 @@ const DeviceStream& DeviceStreamManager::stream(DeviceStreamType streamToGet) co
      return impl_->streams_[streamToGet];
  }
  
+const DeviceStream& DeviceStreamManager::bondedStream(bool hasPPDomainDecomposition) const
+{
+    // Bonded forces reuse a non-bonded stream; which one depends on domain decomposition.
+    if (!hasPPDomainDecomposition)
+    {
+        GMX_RELEASE_ASSERT(stream(DeviceStreamType::NonBondedLocal).isValid(),
+                           "GPU non-bonded local stream should be valid in order to use GPU "
+                           "version of bonded forces without domain decomposition.");
+        return stream(DeviceStreamType::NonBondedLocal);
+    }
+
+    // With PP domain decomposition the non-local non-bonded stream is returned.
+    GMX_RELEASE_ASSERT(stream(DeviceStreamType::NonBondedNonLocal).isValid(),
+                       "GPU non-bonded non-local stream should be valid in order to use GPU "
+                       "version of bonded forces with domain decomposition.");
+    return stream(DeviceStreamType::NonBondedNonLocal);
+}
+
  bool DeviceStreamManager::streamIsValid(DeviceStreamType streamToCheck) const
  {
      return impl_->streams_[streamToCheck].isValid();
diff --git a/src/gromacs/gpu_utils/device_stream_manager.h b/src/gromacs/gpu_utils/device_stream_manager.h

index 4cfa6161a429c46d1426c6dd65338cedc931904b..4565d1ac0c2044633420536cd4aecfffea5e07d6 100644 (file)
--- a/src/gromacs/gpu_utils/device_stream_manager.h
+++ b/src/gromacs/gpu_utils/device_stream_manager.h
@@ -124,6 +124,12 @@ public:
       */
      const DeviceStream& stream(DeviceStreamType streamToGet) const;
  
+    /*! \brief Returns a handle to the GPU stream to compute bonded forces in.
+     *
+     * \param[in] hasPPDomainDecomposition Whether there is a particle-particle domain decomposition.
+     */
+    const DeviceStream& bondedStream(bool hasPPDomainDecomposition) const;
+
      /*! \brief Return whether the requested GPU stream is valid for use.
       *
       * \param[in] streamToCheck Which stream to check.
diff --git a/src/gromacs/gpu_utils/tests/device_stream_manager.cpp b/src/gromacs/gpu_utils/tests/device_stream_manager.cpp

index 1c0330e02a1be2749f176f8c383b79d0990b0fdc..e75c80bc079970e28597aa038e0fe18f775c9800 100644 (file)
--- a/src/gromacs/gpu_utils/tests/device_stream_manager.cpp
+++ b/src/gromacs/gpu_utils/tests/device_stream_manager.cpp
@@ -171,7 +171,6 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
              expectValidStreams(&manager, { DeviceStreamType::Pme, DeviceStreamType::NonBondedLocal,
                                             DeviceStreamType::NonBondedNonLocal, DeviceStreamType::PmePpTransfer,
                                             DeviceStreamType::UpdateAndConstraints });
-            expectInvalidStreams(&manager, {});
          }
  
          {
diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu

index 0d5367f698f54e46a8e13a112dc852130f1fa2a2..faa775a0f7d9cd62529b9af9a92dffc480bcc4a1 100644 (file)
--- a/src/gromacs/listed_forces/gpubonded_impl.cu
+++ b/src/gromacs/listed_forces/gpubonded_impl.cu
@@ -51,6 +51,7 @@
  #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
  #include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
  #include "gromacs/gpu_utils/devicebuffer.h"
  #include "gromacs/gpu_utils/typecasts.cuh"
  #include "gromacs/mdtypes/enerdata.h"
@@ -71,6 +72,9 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
      deviceContext_(deviceContext),
      deviceStream_(deviceStream)
  {
+    GMX_RELEASE_ASSERT(deviceStream.isValid(),
+                       "Can't run GPU version of bonded forces in stream that is not valid.");
+
      wcycle_ = wcycle;
  
      allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
@@ -81,7 +85,7 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
                         deviceStream_, GpuApiCallBehavior::Sync, nullptr);
      vTot_.resize(F_NRE);
      allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
-    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
  
      kernelParams_.d_forceParams = d_forceParams_;
      kernelParams_.d_xq          = d_xq_;
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu

index 562c1be500972fa717e733dfe24d1b7d6a271f9d..76899bbd82c21ed484d8f7f4977b524d62f90295 100644 (file)
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -58,6 +58,7 @@
  
  #include "gromacs/gpu_utils/cudautils.cuh"
  #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
  #include "gromacs/gpu_utils/devicebuffer.h"
  #include "gromacs/gpu_utils/gputraits.cuh"
  #include "gromacs/gpu_utils/vectype_ops.cuh"
@@ -121,8 +122,8 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fRead
      // Make sure that the forces are ready on device before proceeding with the update.
      fReadyOnDevice->enqueueWaitEvent(deviceStream_);
  
-    // The integrate should save a copy of the current coordinates in d_xp_ and write updated once
-    // into d_x_. The d_xp_ is only needed by constraints.
+    // The integrate should save a copy of the current coordinates in d_xp_ and write updated
+    // ones into d_x_. The d_xp_ is only needed by constraints.
      integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat,
                             doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
      // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 9d341e1eaeb646bd71d72d14b0c0d043013596cc..f7f39bbee79329875fdac34f78629b438f99841f 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -67,6 +67,7 @@
  #include "gromacs/fileio/trxio.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/imd/imd.h"
  #include "gromacs/listed_forces/manage_threading.h"
@@ -353,6 +354,7 @@ void gmx::LegacySimulator::do_md()
  
      StatePropagatorDataGpu* stateGpu = fr->stateGpu;
  
+    // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
      if (useGpuForUpdate)
      {
          GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
@@ -397,14 +399,17 @@ void gmx::LegacySimulator::do_md()
          {
              GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
          }
-
-        GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
-                           "GPU device context should be initialized to use GPU update.");
-        GMX_RELEASE_ASSERT(stateGpu->getUpdateStream() != nullptr,
-                           "Update stream can not be nullptr when update is on a GPU.");
-        integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
-                                                          *stateGpu->getUpdateStream(),
-                                                          stateGpu->xUpdatedOnDevice());
+        GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
+                           "Device stream manager should be initialized in order to use GPU "
+                           "update-constraints.");
+        GMX_RELEASE_ASSERT(
+                fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
+                "Update stream should be initialized in order to use GPU "
+                "update-constraints.");
+        integrator = std::make_unique<UpdateConstrainGpu>(
+                *ir, *top_global, fr->deviceStreamManager->context(),
+                fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
+                stateGpu->xUpdatedOnDevice());
  
          integrator->setPbc(PbcType::Xyz, state->box);
      }
@@ -864,21 +869,11 @@ void gmx::LegacySimulator::do_md()
                  if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
                      && useGpuForNonbonded && is1D(*cr->dd))
                  {
+                    GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
+                                       "GPU device manager has to be initialized to use GPU "
+                                       "version of halo exchange.");
                      // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
-                    const DeviceStream* localStream =
-                            Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
-                    const DeviceStream* nonLocalStream = Nbnxm::gpu_get_command_stream(
-                            fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
-                    GMX_RELEASE_ASSERT(
-                            fr->deviceContext != nullptr,
-                            "GPU device context should be initialized to use GPU halo exchange.");
-                    GMX_RELEASE_ASSERT(localStream != nullptr,
-                                       "Local non-bonded stream can't be nullptr when using GPU "
-                                       "halo exchange.");
-                    GMX_RELEASE_ASSERT(nonLocalStream != nullptr,
-                                       "Non-local non-bonded stream can't be nullptr when using "
-                                       "GPU halo exchange.");
-                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, *localStream, *nonLocalStream);
+                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager);
                  }
              }
          }
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index 604ff0ed7bf7c2ed50829d0c830c166358f23359..43ac7d6164a7ddf3aef5cb1d03d089103fc4c851 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -74,6 +74,7 @@
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nrnb.h"
  #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/hardware/cpuinfo.h"
  #include "gromacs/hardware/detecthardware.h"
@@ -1141,19 +1142,23 @@ int Mdrunner::mdrunner()
              EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
  
      // Get the device handles for the modules, nullptr when no task is assigned.
-    int                            deviceId      = -1;
-    DeviceInformation*             deviceInfo    = gpuTaskAssignments.initDevice(&deviceId);
-    std::unique_ptr<DeviceContext> deviceContext = nullptr;
-    if (deviceInfo != nullptr)
+    int                deviceId   = -1;
+    DeviceInformation* deviceInfo = gpuTaskAssignments.initDevice(&deviceId);
+
+    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?)
+    bool useTiming = true;
+    if (GMX_GPU == GMX_GPU_CUDA)
      {
-        if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
-        {
-            dd_setup_dlb_resource_sharing(cr, deviceId);
-        }
-        deviceContext = std::make_unique<DeviceContext>(*deviceInfo);
+        /* WARNING: CUDA timings are incorrect with multiple streams.
+         *          This is the main reason why they are disabled by default.
+         */
+        // TODO: Consider turning on by default when we can detect nr of streams.
+        useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+    }
+    else if (GMX_GPU == GMX_GPU_OPENCL)
+    {
+        useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
      }
-
-    // TODO Initialize GPU streams here.
  
      // TODO Currently this is always built, yet DD partition code
      // checks if it is built before using it. Probably it should
@@ -1190,6 +1195,19 @@ int Mdrunner::mdrunner()
      const bool printHostName = (cr->nnodes > 1);
      gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
  
+    std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
+
+    if (deviceInfo != nullptr)
+    {
+        if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
+        {
+            dd_setup_dlb_resource_sharing(cr, deviceId);
+        }
+        deviceStreamManager = std::make_unique<DeviceStreamManager>(
+                *deviceInfo, useGpuForPme, useGpuForNonbonded, havePPDomainDecomposition(cr),
+                useGpuForUpdate, useTiming);
+    }
+
      // If the user chose a task assignment, give them some hints
      // where appropriate.
      if (!userGpuTaskAssignment.empty())
@@ -1348,32 +1366,36 @@ int Mdrunner::mdrunner()
                        opt2fn("-tablep", filenames.size(), filenames.data()),
                        opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
  
-        fr->deviceContext = deviceContext.get();
+        // Save a handle to device stream manager to use elsewhere in the code
+        // TODO: Forcerec is not a correct place to store it.
+        fr->deviceStreamManager = deviceStreamManager.get();
  
          if (devFlags.enableGpuPmePPComm && !thisRankHasDuty(cr, DUTY_PME))
          {
              GMX_RELEASE_ASSERT(
-                    deviceContext != nullptr,
-                    "Device context can not be nullptr when PME-PP direct communications object.");
+                    deviceStreamManager != nullptr,
+                    "GPU device stream manager should be valid in order to use PME-PP direct "
+                    "communications.");
+            GMX_RELEASE_ASSERT(
+                    deviceStreamManager->streamIsValid(DeviceStreamType::PmePpTransfer),
+                    "GPU PP-PME stream should be valid in order to use GPU PME-PP direct "
+                    "communications.");
              fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
-                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, *deviceContext);
+                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, deviceStreamManager->context(),
+                    deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
          }
  
-        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, deviceInfo,
-                                        fr->deviceContext, &mtop, box, wcycle);
+        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, useGpuForNonbonded,
+                                        deviceStreamManager.get(), &mtop, box, wcycle);
+        // TODO: Move the logic below to a GPU bonded builder
          if (useGpuForBonded)
          {
-            auto stream = havePPDomainDecomposition(cr)
-                                  ? Nbnxm::gpu_get_command_stream(
-                                            fr->nbv->gpu_nbv, gmx::InteractionLocality::NonLocal)
-                                  : Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv,
-                                                                  gmx::InteractionLocality::Local);
-            GMX_RELEASE_ASSERT(
-                    fr->deviceContext != nullptr,
-                    "Device context can not be nullptr when computing bonded interactions on GPU.");
-            GMX_RELEASE_ASSERT(stream != nullptr,
-                               "Can'r run GPU version of bonded forces in nullptr stream.");
-            gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, *stream, wcycle);
+            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+                               "GPU device stream manager should be valid in order to use GPU "
+                               "version of bonded forces.");
+            gpuBonded = std::make_unique<GpuBonded>(
+                    mtop.ffparams, deviceStreamManager->context(),
+                    deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)), wcycle);
              fr->gpuBonded = gpuBonded.get();
          }
  
@@ -1450,9 +1472,11 @@ int Mdrunner::mdrunner()
      if (thisRankHasPmeGpuTask)
      {
          GMX_RELEASE_ASSERT(
-                deviceContext != nullptr,
-                "Device context can not be nullptr when building PME GPU program object.");
-        pmeGpuProgram = buildPmeGpuProgram(*deviceContext);
+                (deviceStreamManager != nullptr),
+                "GPU device stream manager should be initialized in order to use GPU for PME.");
+        GMX_RELEASE_ASSERT((deviceInfo != nullptr),
+                           "GPU device should be initialized in order to use GPU for PME.");
+        pmeGpuProgram = buildPmeGpuProgram(deviceStreamManager->context());
      }
  
      /* Initiate PME if necessary,
@@ -1478,10 +1502,23 @@ int Mdrunner::mdrunner()
          {
              try
              {
+                // TODO: This should be in the builder.
+                GMX_RELEASE_ASSERT(!useGpuForPme || (deviceStreamManager != nullptr),
+                                   "Device stream manager should be valid in order to use GPU "
+                                   "version of PME.");
+                GMX_RELEASE_ASSERT(
+                        !useGpuForPme || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
+                        "GPU PME stream should be valid in order to use GPU version of PME.");
+
+                const DeviceContext* deviceContext =
+                        useGpuForPme ? &deviceStreamManager->context() : nullptr;
+                const DeviceStream* pmeStream =
+                        useGpuForPme ? &deviceStreamManager->stream(DeviceStreamType::Pme) : nullptr;
+
                  pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec, nChargePerturbed != 0,
                                         nTypePerturbed != 0, mdrunOptions.reproducible, ewaldcoeff_q,
                                         ewaldcoeff_lj, gmx_omp_nthreads_get(emntPME), pmeRunMode,
-                                       nullptr, deviceInfo, pmeGpuProgram.get(), mdlog);
+                                       nullptr, deviceContext, pmeStream, pmeGpuProgram.get(), mdlog);
              }
              GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
          }
@@ -1581,24 +1618,13 @@ int Mdrunner::mdrunner()
              && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
                  || runScheduleWork.simulationWork.useGpuBufferOps))
          {
-            const DeviceStream* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
-            const DeviceStream* localStream =
-                    fr->nbv->gpu_nbv != nullptr
-                            ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
-                            : nullptr;
-            const DeviceStream* nonLocalStream =
-                    fr->nbv->gpu_nbv != nullptr
-                            ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
-                            : nullptr;
              GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
                                                        ? GpuApiCallBehavior::Async
                                                        : GpuApiCallBehavior::Sync;
-            GMX_RELEASE_ASSERT(
-                    deviceContext != nullptr,
-                    "Device context can not be nullptr when building GPU propagator data object.");
+            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+                               "GPU device stream manager should be initialized to use GPU.");
              stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                    pmeStream, localStream, nonLocalStream, *deviceContext, transferKind,
-                    pme_gpu_get_block_size(fr->pmedata), wcycle);
+                    *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
              fr->stateGpu = stateGpu.get();
          }
  
@@ -1634,7 +1660,7 @@ int Mdrunner::mdrunner()
          /* do PME only */
          walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
          gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode,
-                    deviceContext.get());
+                    deviceStreamManager.get());
      }
  
      wallcycle_stop(wcycle, ewcRUN);
@@ -1648,6 +1674,7 @@ int Mdrunner::mdrunner()
      // clean up cycle counter
      wallcycle_destroy(wcycle);
  
+    deviceStreamManager.reset(nullptr);
      // Free PME data
      if (pmedata)
      {
@@ -1695,7 +1722,6 @@ int Mdrunner::mdrunner()
      }
  
      free_gpu(deviceInfo);
-    deviceContext.reset(nullptr);
      sfree(fcd);
  
      if (doMembed)
@@ -1732,7 +1758,7 @@ int Mdrunner::mdrunner()
      }
  #endif
      return rc;
-}
+}
  
  Mdrunner::~Mdrunner()
  {
diff --git a/src/gromacs/mdtypes/forcerec.h b/src/gromacs/mdtypes/forcerec.h

index f518692e7d037bf52f00c5cd36741ff8226c8fab..169cb1c15348833fcb1159768fafd71a3adf0d49 100644 (file)
--- a/src/gromacs/mdtypes/forcerec.h
+++ b/src/gromacs/mdtypes/forcerec.h
@@ -59,6 +59,7 @@ struct t_QMMMrec;
  
  namespace gmx
  {
+class DeviceStreamManager;
  class GpuBonded;
  class ForceProviders;
  class StatePropagatorDataGpu;
@@ -284,6 +285,8 @@ struct t_forcerec
      // TODO: This is not supposed to be here. StatePropagatorDataGpu should be a part of
      //       general StatePropagatorData object that is passed around
      gmx::StatePropagatorDataGpu* stateGpu = nullptr;
+    // TODO: Should not be here. This is here only to pass the pointer around.
+    gmx::DeviceStreamManager* deviceStreamManager = nullptr;
  
      //! GPU device context
      DeviceContext* deviceContext = nullptr;
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h

index a4f77cbf164d2fcd36f5d0ae25c934bfb2955f6f..bcc6dc5c384bbad413b65489b53301e3b9d4f58b 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -66,6 +66,7 @@ struct gmx_wallcycle;
  
  namespace gmx
  {
+class DeviceStreamManager;
  
  class StatePropagatorDataGpu
  {
@@ -86,35 +87,15 @@ public:
       * ops are offloaded. This feature is currently not available in OpenCL and
       * hence these streams are not set in these builds.
       *
-     * \note In CUDA, the update stream is created in the constructor as a temporary
-     *       solution, in place until the stream manager is introduced.
-     *       Note that this makes it impossible to construct this object in CUDA
-     *       builds executing on a host without any CUDA-capable device available.
-     *
-     * \note In CUDA, \p deviceContext is unused, hence always nullptr;
-     *       all stream arguments can also be nullptr in runs where the
-     *       respective streams are not required.
-     *       In OpenCL, \p deviceContext needs to be a valid device context.
-     *       In OpenCL runs StatePropagatorDataGpu is currently only used
-     *       with PME offload, and only on ranks with PME duty. Hence, the
-     *       \p pmeStream argument needs to be a valid OpenCL queue object
-     *       which must have been created in \p deviceContext.
-     *
-     *  \param[in] pmeStream       Device PME stream, nullptr allowed.
-     *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
-     *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
-     *  \param[in] deviceContext   Device context, nullptr allowed.
-     *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
+     *  \param[in] deviceStreamManager         Object that owns the DeviceContext and DeviceStreams.
+     *  \param[in] transferKind                H2D/D2H transfer call behavior (synchronous or not).
       *  \param[in] allocationBlockSizeDivisor  Deterines padding size for coordinates buffer.
-     *  \param[in] wcycle          Wall cycle counter data.
+     *  \param[in] wcycle                      Wall cycle counter data.
       */
-    StatePropagatorDataGpu(const DeviceStream*  pmeStream,
-                           const DeviceStream*  localStream,
-                           const DeviceStream*  nonLocalStream,
-                           const DeviceContext& deviceContext,
-                           GpuApiCallBehavior   transferKind,
-                           int                  allocationBlockSizeDivisor,
-                           gmx_wallcycle*       wcycle);
+    StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+                           GpuApiCallBehavior         transferKind,
+                           int                        allocationBlockSizeDivisor,
+                           gmx_wallcycle*             wcycle);
  
      /*! \brief Constructor to use in PME-only rank and in tests.
       *
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp

index 68c884f99b6e448adba3205219ac68b20e1f978a..269b6eb3778650061ccbf9c82789e9da4b357350 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -54,10 +54,7 @@ class StatePropagatorDataGpu::Impl
  {
  };
  
-StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream       */,
-                                               const DeviceStream* /* localStream     */,
-                                               const DeviceStream* /* nonLocalStream  */,
-                                               const DeviceContext& /* deviceContext   */,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& /* deviceStreamManager */,
                                                 GpuApiCallBehavior /* transferKind    */,
                                                 int /* allocationBlockSizeDivisor */,
                                                 gmx_wallcycle* /*   wcycle */) :
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h

index b0576925479832645669ca9d2fd0e7e6e754ab60..fd9ff197addc5c3ee41dd21c7bfb9063802a4150 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -85,35 +85,15 @@ public:
       * ops are offloaded. This feature is currently not available in OpenCL and
       * hence these streams are not set in these builds.
       *
-     * \note In CUDA, the update stream is created in the constructor as a temporary
-     *       solution, in place until the stream manager is introduced.
-     *       Note that this makes it impossible to construct this object in CUDA
-     *       builds executing on a host without any CUDA-capable device available.
-     *
-     * \note In CUDA, \p deviceContext is unused, hence always nullptr;
-     *       all stream arguments can also be nullptr in runs where the
-     *       respective streams are not required.
-     *       In OpenCL, \p deviceContext needs to be a valid device context.
-     *       In OpenCL runs StatePropagatorDataGpu is currently only used
-     *       with PME offload, and only on ranks with PME duty. Hence, the
-     *       \p pmeStream argument needs to be a valid OpenCL queue object
-     *       which must have been created in \p deviceContext.
-     *
-     *  \param[in] pmeStream       Device PME stream, nullptr allowed.
-     *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
-     *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
-     *  \param[in] deviceContext   Device context, nullptr allowed.
-     *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
+     *  \param[in] deviceStreamManager         Object that owns the DeviceContext and DeviceStreams.
+     *  \param[in] transferKind                H2D/D2H transfer call behavior (synchronous or not).
       *  \param[in] allocationBlockSizeDivisor  Determines the padding size for coordinates buffer.
-     *  \param[in] wcycle          Wall cycle counter data.
+     *  \param[in] wcycle                      Wall cycle counter data.
       */
-    Impl(const DeviceStream*  pmeStream,
-         const DeviceStream*  localStream,
-         const DeviceStream*  nonLocalStream,
-         const DeviceContext& deviceContext,
-         GpuApiCallBehavior   transferKind,
-         int                  allocationBlockSizeDivisor,
-         gmx_wallcycle*       wcycle);
+    Impl(const DeviceStreamManager& deviceStreamManager,
+         GpuApiCallBehavior         transferKind,
+         int                        allocationBlockSizeDivisor,
+         gmx_wallcycle*             wcycle);
  
      /*! \brief Constructor to use in PME-only rank and in tests.
       *
@@ -346,9 +326,6 @@ private:
      //! GPU Update-constreaints stream.
      const DeviceStream* updateStream_;
  
-    //! An owning pointer to the update stream, in case we manage its lifetime here. Temporary.
-    DeviceStream updateStreamOwn_;
-
      // Streams to use for coordinates H2D and D2H copies (one event for each atom locality)
      EnumerationArray<AtomLocality, const DeviceStream*> xCopyStreams_ = { { nullptr } };
      // Streams to use for velocities H2D and D2H copies (one event for each atom locality)
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index e60e9fa73b7eb7ded130cccc5d27699d6c63da84..bf927f2da297084caae44ef98a499038fabbccc7 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -46,14 +46,9 @@
  
  #if GMX_GPU != GMX_GPU_NONE
  
-#    if GMX_GPU == GMX_GPU_CUDA
-#        include "gromacs/gpu_utils/cudautils.cuh"
-#    endif
+#    include "gromacs/gpu_utils/device_stream_manager.h"
  #    include "gromacs/gpu_utils/devicebuffer.h"
  #    include "gromacs/gpu_utils/gputraits.h"
-#    if GMX_GPU == GMX_GPU_OPENCL
-#        include "gromacs/gpu_utils/oclutils.h"
-#    endif
  #    include "gromacs/math/vectypes.h"
  #    include "gromacs/mdtypes/state_propagator_data_gpu.h"
  #    include "gromacs/timing/wallcycle.h"
@@ -65,55 +60,31 @@
  namespace gmx
  {
  
-StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
-                                   const DeviceStream*  localStream,
-                                   const DeviceStream*  nonLocalStream,
-                                   const DeviceContext& deviceContext,
-                                   GpuApiCallBehavior   transferKind,
-                                   int                  allocationBlockSizeDivisor,
-                                   gmx_wallcycle*       wcycle) :
-    deviceContext_(deviceContext),
+StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManager,
+                                   GpuApiCallBehavior         transferKind,
+                                   int                        allocationBlockSizeDivisor,
+                                   gmx_wallcycle*             wcycle) :
+    deviceContext_(deviceStreamManager.context()),
      transferKind_(transferKind),
      allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
      wcycle_(wcycle)
  {
-    static_assert(GMX_GPU != GMX_GPU_NONE,
-                  "This object should only be constructed on the GPU code-paths.");
+    static_assert(
+            GMX_GPU != GMX_GPU_NONE,
+            "GPU state propagator data object should only be constructed on the GPU code-paths.");
  
-    // TODO: Refactor when the StreamManager is introduced.
+    // Keep pointers to the manager-owned streams; they are needed for re-initialization.
+    pmeStream_      = &deviceStreamManager.stream(DeviceStreamType::Pme);
+    localStream_    = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
+    nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
+    // In OpenCL, the update stream is the PME stream, which is used for H2D coordinate transfer.
      if (GMX_GPU == GMX_GPU_OPENCL)
      {
-        GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
-
-        // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
-        pmeStream_    = pmeStream;
-        updateStream_ = pmeStream;
-        GMX_UNUSED_VALUE(localStream);
-        GMX_UNUSED_VALUE(nonLocalStream);
+        updateStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme);
      }
-
-    if (GMX_GPU == GMX_GPU_CUDA)
+    else
      {
-        if (pmeStream != nullptr)
-        {
-            pmeStream_ = pmeStream;
-        }
-        if (localStream != nullptr)
-        {
-            localStream_ = localStream;
-        }
-        if (nonLocalStream != nullptr)
-        {
-            nonLocalStream_ = nonLocalStream;
-        }
-
-        // TODO: The update stream should be created only when it is needed.
-#    if (GMX_GPU == GMX_GPU_CUDA)
-        // In CUDA we only need priority to create stream.
-        // (note that this will be moved from here in the follow-up patch)
-        updateStreamOwn_.init(deviceContext, DeviceStreamPriority::Normal, false);
-        updateStream_ = &updateStreamOwn_;
-#    endif
+        updateStream_ = &deviceStreamManager.stream(DeviceStreamType::UpdateAndConstraints);
      }
  
      // Map the atom locality to the stream that will be used for coordinates,
@@ -142,10 +113,11 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
      allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
      wcycle_(wcycle)
  {
-    static_assert(GMX_GPU != GMX_GPU_NONE,
-                  "This object should only be constructed on the GPU code-paths.");
+    static_assert(
+            GMX_GPU != GMX_GPU_NONE,
+            "GPU state propagator data object should only be constructed on the GPU code-paths.");
  
-    GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
+    GMX_ASSERT(pmeStream->isValid(), "GPU PME stream should be valid.");
      pmeStream_      = pmeStream;
      localStream_    = pmeStream; // For clearing the force buffer
      nonLocalStream_ = nullptr;
@@ -256,8 +228,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>
  
      GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
  
-    GMX_ASSERT(deviceStream.stream() != nullptr,
-               "No stream is valid for copying with given atom locality.");
+    GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
      wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
      wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
  
@@ -291,8 +262,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
  
      GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
  
-    GMX_ASSERT(deviceStream.stream() != nullptr,
-               "No stream is valid for copying with given atom locality.");
+    GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
      wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
      wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
  
@@ -546,14 +516,11 @@ int StatePropagatorDataGpu::Impl::numAtomsAll()
  }
  
  
-StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
-                                               const DeviceStream*  localStream,
-                                               const DeviceStream*  nonLocalStream,
-                                               const DeviceContext& deviceContext,
-                                               GpuApiCallBehavior   transferKind,
-                                               int                  allocationBlockSizeDivisor,
-                                               gmx_wallcycle*       wcycle) :
-    impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+                                               GpuApiCallBehavior         transferKind,
+                                               int            allocationBlockSizeDivisor,
+                                               gmx_wallcycle* wcycle) :
+    impl_(new Impl(deviceStreamManager, transferKind, allocationBlockSizeDivisor, wcycle))
  {
  }
  
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index c015326e8dbee594583eda8bf566090b178c9c24..95aca5ba1d5dec5cf51faca94fa3ba4909791de4 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -366,7 +366,7 @@ static inline int calc_shmem_required_nonbonded(const int               num_thre
   */
  void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality)
  {
-    const DeviceStream& deviceStream = nb->deviceStreams[interactionLocality];
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
  
      /* When we get here all misc operations issued in the local stream as well as
         the local xq H2D are done,
@@ -405,7 +405,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
      cu_atomdata_t*      adat         = nb->atdat;
      cu_plist_t*         plist        = nb->plist[iloc];
      cu_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
  
      bool bDoTime = nb->bDoTime;
  
@@ -485,7 +485,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
      cu_nbparam_t*       nbp          = nb->nbparam;
      cu_plist_t*         plist        = nb->plist[iloc];
      cu_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
  
      bool bDoTime = nb->bDoTime;
  
@@ -598,7 +598,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
      cu_nbparam_t*       nbp          = nb->nbparam;
      cu_plist_t*         plist        = nb->plist[iloc];
      cu_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
  
      bool bDoTime = nb->bDoTime;
  
@@ -732,7 +732,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
      cu_atomdata_t*      adat         = nb->atdat;
      cu_timers_t*        t            = nb->timers;
      bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
  
      /* don't launch non-local copy-back if there was no non-local work to do */
      if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
@@ -836,7 +836,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
      const int                  numAtomsPerCell = grid.numAtomsPerCell();
      Nbnxm::InteractionLocality interactionLoc  = gpuAtomToInteractionLocality(locality);
  
-    const DeviceStream& deviceStream = nb->deviceStreams[interactionLoc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc];
  
      int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
      // avoid empty kernel launch, skip to inserting stream dependency
@@ -901,7 +901,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
      GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer");
  
      const InteractionLocality iLocality    = gpuAtomToInteractionLocality(atomLocality);
-    const DeviceStream&       deviceStream = nb->deviceStreams[iLocality];
+    const DeviceStream&       deviceStream = *nb->deviceStreams[iLocality];
      cu_atomdata_t*            adat         = nb->atdat;
  
      size_t gmx_used_in_debug numDependency = static_cast<size_t>((useGpuFPmeReduction == true))
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu

index a76880b17ee1484e0b48c2649b8b6b1ada46749c..6579d4100497e1784d483c4e942920e10f506a0d 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -51,7 +51,7 @@
  
  // TODO Remove this comment when the above order issue is resolved
  #include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
  #include "gromacs/gpu_utils/pmalloc_cuda.h"
@@ -413,16 +413,16 @@ static void cuda_init_const(NbnxmGpu*                       nb,
      nbnxn_cuda_clear_e_fshift(nb);
  }
  
-NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
-                   const interaction_const_t* ic,
-                   const PairlistParams&      listParams,
-                   const nbnxn_atomdata_t*    nbat,
-                   bool                       bLocalAndNonlocal)
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+                   const interaction_const_t*      ic,
+                   const PairlistParams&           listParams,
+                   const nbnxn_atomdata_t*         nbat,
+                   bool                            bLocalAndNonlocal)
  {
      cudaError_t stat;
  
      auto nb            = new NbnxmGpu();
-    nb->deviceContext_ = &deviceContext;
+    nb->deviceContext_ = &deviceStreamManager.context();
      snew(nb->atdat, 1);
      snew(nb->nbparam, 1);
      snew(nb->plist[InteractionLocality::Local], 1);
@@ -444,8 +444,10 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
      init_plist(nb->plist[InteractionLocality::Local]);
  
      /* local/non-local GPU streams */
-    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
-                                                       DeviceStreamPriority::Normal, nb->bDoTime);
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+    nb->deviceStreams[InteractionLocality::Local] =
+            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
      if (nb->bUseTwoStreams)
      {
          init_plist(nb->plist[InteractionLocality::NonLocal]);
@@ -454,8 +456,12 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
           * priorities, because we are querying the priority range which in this
           * case will be a single value.
           */
-        nb->deviceStreams[InteractionLocality::NonLocal].init(
-                *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                           "Non-local non-bonded stream should be initialized to use GPU for "
+                           "non-bonded with domain decomposition.");
+        nb->deviceStreams[InteractionLocality::NonLocal] =
+                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
+        ;
      }
  
      /* init events for sychronization (timing disabled for performance reasons!) */
@@ -504,7 +510,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
  {
      char                sbuf[STRLEN];
      bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
      cu_plist_t*         d_plist      = nb->plist[iloc];
  
      if (d_plist->na_c < 0)
@@ -561,7 +567,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
  void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
  {
      cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
  
      /* only if we have a dynamic box */
      if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -576,7 +582,7 @@ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear)
  {
      cudaError_t    stat;
      cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
  
      stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
      CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
@@ -587,7 +593,7 @@ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb)
  {
      cudaError_t    stat;
      cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
  
      stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
      CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -616,7 +622,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
      bool                bDoTime      = nb->bDoTime;
      cu_timers_t*        timers       = nb->timers;
      cu_atomdata_t*      d_atdat      = nb->atdat;
-    const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
  
      natoms    = nbat->numAtoms();
      realloced = false;
@@ -806,13 +812,6 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
      return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
  }
  
-const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
-{
-    assert(nb);
-
-    return &nb->deviceStreams[iloc];
-}
-
  void* gpu_get_xq(NbnxmGpu* nb)
  {
      assert(nb);
@@ -838,7 +837,7 @@ DeviceBuffer<gmx::RVec> gpu_get_fshift(NbnxmGpu* nb)
  /* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
  void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
  {
-    const DeviceStream& deviceStream  = gpu_nbv->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream  = *gpu_nbv->deviceStreams[InteractionLocality::Local];
      bool                bDoTime       = gpu_nbv->bDoTime;
      const int           maxNumColumns = gridSet.numColumnsMax();
  
@@ -929,7 +928,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
                                      GpuEventSynchronizer* const localReductionDone)
  {
  
-    const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream = *gpu_nbv->deviceStreams[InteractionLocality::Local];
  
      GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
      gpu_nbv->localFReductionDone = localReductionDone;
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h

index de5241a5feec681eaf81e05b850f26a47d28b159..68d5da81c4b7bcad19f6e201dc41de7bd504648a 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
@@ -306,7 +306,7 @@ struct NbnxmGpu
      /*! \brief staging area where fshift/energies get downloaded */
      nb_staging_t nbst;
      /*! \brief local and non-local GPU streams */
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
  
      /*! \brief Events used for synchronization */
      /*! \{ */
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h

index f5b3d813da09e7043e864a497d17994730aaa2cb..01f257e54f08fd787bdba775ceeeced859ef5ba1 100644 (file)
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -399,7 +399,7 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
              // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
              wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
  
-            if (!haveStreamTasksCompleted(nb->deviceStreams[iLocality]))
+            if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
              {
                  wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
@@ -412,7 +412,7 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
          }
          else if (haveResultToWaitFor)
          {
-            nb->deviceStreams[iLocality].synchronize();
+            nb->deviceStreams[iLocality]->synchronize();
          }
  
          // TODO: this needs to be moved later because conditional wait could brake timing
diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h

index 8e114d1c65cb9affd4c5c9ee77f60dfd21c03168..21fc8174b56d494617b6e47ba5251563db467b05 100644 (file)
--- a/src/gromacs/nbnxm/gpu_data_mgmt.h
+++ b/src/gromacs/nbnxm/gpu_data_mgmt.h
@@ -50,9 +50,6 @@
  #include "gromacs/gpu_utils/gpu_macros.h"
  #include "gromacs/mdtypes/locality.h"
  
-class DeviceContext;
-class DeviceStream;
-
  struct NbnxmGpu;
  struct gmx_gpu_info_t;
  struct DeviceInformation;
@@ -62,12 +59,19 @@ struct NbnxnPairlistGpu;
  struct PairlistParams;
  struct interaction_const_t;
  
+class DeviceStream;
+
+namespace gmx
+{
+class DeviceStreamManager;
+}
+
  namespace Nbnxm
  {
  
  /** Initializes the data structures related to GPU nonbonded calculations. */
  GPU_FUNC_QUALIFIER
-NbnxmGpu* gpu_init(const DeviceContext gmx_unused& deviceContext,
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager gmx_unused& deviceStreamManager,
                     const interaction_const_t gmx_unused* ic,
                     const PairlistParams gmx_unused& listParams,
                     const nbnxn_atomdata_t gmx_unused* nbat,
diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h

index a15f646ed98eb38925e8c4671397701965d63700..d5ced753f7cc2b2f2d8dac8d3e4508a486b39d28 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm.h
+++ b/src/gromacs/nbnxm/nbnxm.h
@@ -120,7 +120,6 @@
  #include "gromacs/utility/enumerationhelpers.h"
  #include "gromacs/utility/real.h"
  
-class DeviceContext;
  struct DeviceInformation;
  struct gmx_domdec_zones_t;
  struct gmx_enerdata_t;
@@ -144,6 +143,7 @@ class GpuEventSynchronizer;
  
  namespace gmx
  {
+class DeviceStreamManager;
  class ForceWithShiftForces;
  class GpuBonded;
  template<typename>
@@ -404,16 +404,16 @@ namespace Nbnxm
  {
  
  /*! \brief Creates an Nbnxm object */
-std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlog,
-                                                   const t_inputrec*        ir,
-                                                   const t_forcerec*        fr,
-                                                   const t_commrec*         cr,
-                                                   const gmx_hw_info_t&     hardwareInfo,
-                                                   const DeviceInformation* deviceInfo,
-                                                   const DeviceContext*     deviceContext,
-                                                   const gmx_mtop_t*        mtop,
-                                                   matrix                   box,
-                                                   gmx_wallcycle*           wcycle);
+std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
+                                                   const t_inputrec*    ir,
+                                                   const t_forcerec*    fr,
+                                                   const t_commrec*     cr,
+                                                   const gmx_hw_info_t& hardwareInfo,
+                                                   bool                 useGpuForNonbonded,
+                                                   const gmx::DeviceStreamManager* deviceStreamManager,
+                                                   const gmx_mtop_t*               mtop,
+                                                   matrix                          box,
+                                                   gmx_wallcycle*                  wcycle);
  
  } // namespace Nbnxm
  
diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp

index d65c59c91d9f07ec1fa08d8271f7c385ce366df5..ea90c12b0c7e713c16777d492c0a0409a554a0e3 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_setup.cpp
+++ b/src/gromacs/nbnxm/nbnxm_setup.cpp
@@ -358,25 +358,24 @@ static int getMinimumIlistCountForGpuBalancing(NbnxmGpu* nbnxmGpu)
      return minimumIlistCount;
  }
  
-std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlog,
-                                                   const t_inputrec*        ir,
-                                                   const t_forcerec*        fr,
-                                                   const t_commrec*         cr,
-                                                   const gmx_hw_info_t&     hardwareInfo,
-                                                   const DeviceInformation* deviceInfo,
-                                                   const DeviceContext*     deviceContext,
-                                                   const gmx_mtop_t*        mtop,
-                                                   matrix                   box,
-                                                   gmx_wallcycle*           wcycle)
+std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
+                                                   const t_inputrec*    ir,
+                                                   const t_forcerec*    fr,
+                                                   const t_commrec*     cr,
+                                                   const gmx_hw_info_t& hardwareInfo,
+                                                   const bool           useGpuForNonbonded,
+                                                   const gmx::DeviceStreamManager* deviceStreamManager,
+                                                   const gmx_mtop_t*               mtop,
+                                                   matrix                          box,
+                                                   gmx_wallcycle*                  wcycle)
  {
      const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
-    const bool useGpu     = deviceInfo != nullptr;
  
-    GMX_RELEASE_ASSERT(!(emulateGpu && useGpu),
+    GMX_RELEASE_ASSERT(!(emulateGpu && useGpuForNonbonded),
                         "When GPU emulation is active, there cannot be a GPU assignment");
  
      NonbondedResource nonbondedResource;
-    if (useGpu)
+    if (useGpuForNonbonded)
      {
          nonbondedResource = NonbondedResource::Gpu;
      }
@@ -425,7 +424,8 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
          enbnxninitcombrule = enbnxninitcombruleNONE;
      }
  
-    auto pinPolicy = (useGpu ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
+    auto pinPolicy = (useGpuForNonbonded ? gmx::PinningPolicy::PinnedIfSupported
+                                         : gmx::PinningPolicy::CannotBePinned);
  
      auto nbat = std::make_unique<nbnxn_atomdata_t>(pinPolicy);
  
@@ -440,18 +440,18 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
      }
      nbnxn_atomdata_init(mdlog, nbat.get(), kernelSetup.kernelType, enbnxninitcombrule, fr->ntype,
                          fr->nbfp, mimimumNumEnergyGroupNonbonded,
-                        (useGpu || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
+                        (useGpuForNonbonded || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
  
      NbnxmGpu* gpu_nbv                          = nullptr;
      int       minimumIlistCountForGpuBalancing = 0;
-    if (useGpu)
+    if (useGpuForNonbonded)
      {
-        GMX_RELEASE_ASSERT(
-                deviceContext != nullptr,
-                "Device context can not be nullptr when to use GPU for non-bonded forces.");
          /* init the NxN GPU data; the last argument tells whether we'll have
           * both local and non-local NB calculation on GPU */
-        gpu_nbv = gpu_init(*deviceContext, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
+        GMX_RELEASE_ASSERT(
+                (deviceStreamManager != nullptr),
+                "Device stream manager should be initialized in order to use GPU for non-bonded.");
+        gpu_nbv = gpu_init(*deviceStreamManager, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
  
          minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv);
      }
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index ca6d9e4b197c80a515665cc2fffb171c9a69f9c2..a35a188400028bc6cedba05ea6044971cd57f947 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -487,7 +487,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
      cl_atomdata_t*      adat         = nb->atdat;
      cl_plist_t*         plist        = nb->plist[iloc];
      cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
  
      bool bDoTime = nb->bDoTime;
  
@@ -587,7 +587,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
      cl_nbparam_t*       nbp          = nb->nbparam;
      cl_plist_t*         plist        = nb->plist[iloc];
      cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
  
      bool bDoTime = nb->bDoTime;
  
@@ -725,7 +725,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
      cl_nbparam_t*       nbp          = nb->nbparam;
      cl_plist_t*         plist        = nb->plist[iloc];
      cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
      bool                bDoTime      = nb->bDoTime;
  
      if (plist->haveFreshList)
@@ -862,7 +862,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
      cl_atomdata_t*      adat         = nb->atdat;
      cl_timers_t*        t            = nb->timers;
      bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
  
      /* don't launch non-local copy-back if there was no non-local work to do */
      if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp

index f11aa2d807b357d9ca2b9ef56e9d764fa8b56421..bc913e0e24d6f2e45bf3e8b89c791f4aae1d7642 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -52,6 +52,7 @@
  
  #include <cmath>
  
+#include "gromacs/gpu_utils/device_stream_manager.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
  #include "gromacs/gpu_utils/oclutils.h"
  #include "gromacs/hardware/gpu_hw_info.h"
@@ -485,7 +486,7 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb)
  
      cl_int           cl_error;
      cl_atomdata_t*   adat = nb->atdat;
-    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
  
      size_t local_work_size[3]  = { 1, 1, 1 };
      size_t global_work_size[3] = { 1, 1, 1 };
@@ -555,16 +556,16 @@ static void nbnxn_ocl_init_const(cl_atomdata_t*                  atomData,
  
  
  //! This function is documented in the header file
-NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
-                   const interaction_const_t* ic,
-                   const PairlistParams&      listParams,
-                   const nbnxn_atomdata_t*    nbat,
-                   const bool                 bLocalAndNonlocal)
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+                   const interaction_const_t*      ic,
+                   const PairlistParams&           listParams,
+                   const nbnxn_atomdata_t*         nbat,
+                   const bool                      bLocalAndNonlocal)
  {
      GMX_ASSERT(ic, "Need a valid interaction constants object");
  
      auto nb            = new NbnxmGpu();
-    nb->deviceContext_ = &deviceContext;
+    nb->deviceContext_ = &deviceStreamManager.context();
      snew(nb->atdat, 1);
      snew(nb->nbparam, 1);
      snew(nb->plist[InteractionLocality::Local], 1);
@@ -578,6 +579,7 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
      nb->timers = new cl_timers_t();
      snew(nb->timings, 1);
  
+    /* set device info, just point it to the right GPU among the detected ones */
      nb->dev_rundata = new gmx_device_runtime_data_t();
  
      /* init nbst */
@@ -591,15 +593,20 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
      nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
  
      /* local/non-local GPU streams */
-    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
-                                                       DeviceStreamPriority::Normal, nb->bDoTime);
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+    nb->deviceStreams[InteractionLocality::Local] =
+            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
  
      if (nb->bUseTwoStreams)
      {
          init_plist(nb->plist[InteractionLocality::NonLocal]);
  
-        nb->deviceStreams[InteractionLocality::NonLocal].init(
-                *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                           "Non-local non-bonded stream should be initialized to use GPU for "
+                           "non-bonded with domain decomposition.");
+        nb->deviceStreams[InteractionLocality::NonLocal] =
+                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
      }
  
      if (nb->bDoTime)
@@ -647,7 +654,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear)
      cl_int gmx_used_in_debug cl_error;
  
      cl_atomdata_t*   atomData = nb->atdat;
-    cl_command_queue ls       = nb->deviceStreams[InteractionLocality::Local].stream();
+    cl_command_queue ls       = nb->deviceStreams[InteractionLocality::Local]->stream();
      cl_float         value    = 0.0F;
  
      cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0,
@@ -669,7 +676,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
  
      /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
      cl_int gmx_unused cl_error;
-    cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local].stream());
+    cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local]->stream());
      GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
  }
  
@@ -681,7 +688,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
      // because getLastRangeTime() gets skipped with empty lists later
      // which leads to the counter not being reset.
      bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
      cl_plist_t*         d_plist      = nb->plist[iloc];
  
      if (d_plist->na_c < 0)
@@ -740,7 +747,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
  void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
  {
      cl_atomdata_t*   adat = nb->atdat;
-    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
  
      /* only if we have a dynamic box */
      if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -760,7 +767,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
      bool                bDoTime      = nb->bDoTime;
      cl_timers_t*        timers       = nb->timers;
      cl_atomdata_t*      d_atdat      = nb->atdat;
-    const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
  
      natoms    = nbat->numAtoms();
      realloced = false;
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h

index 67029075239f5dc9e47535d0ef8c57961e3eb6dd..cda229478339145739f29b38b61b709e74a1f50f 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h
@@ -364,7 +364,7 @@ struct NbnxmGpu
      nb_staging_t nbst;
  
      //! local and non-local GPU queues
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
  
      /*! \brief Events used for synchronization */
      /*! \{ */
author	Artem Zhmurov <zhmurov@gmail.com>
	Mon, 24 Feb 2020 09:22:40 +0000 (10:22 +0100)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Wed, 25 Mar 2020 06:47:15 +0000 (07:47 +0100)
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/domdec/domdec.h		patch \| blob \| history
src/gromacs/ewald/pme.cpp		patch \| blob \| history
src/gromacs/ewald/pme.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_internal.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gpu_internal.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu_types_host.h		patch \| blob \| history
src/gromacs/ewald/pme_gpu_types_host_impl.h		patch \| blob \| history
src/gromacs/ewald/pme_only.cpp		patch \| blob \| history
src/gromacs/ewald/pme_only.h		patch \| blob \| history
src/gromacs/ewald/pme_pp_comm_gpu.h		patch \| blob \| history
src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp		patch \| blob \| history
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu		patch \| blob \| history
src/gromacs/ewald/pme_pp_comm_gpu_impl.h		patch \| blob \| history
src/gromacs/ewald/tests/CMakeLists.txt		patch \| blob \| history
src/gromacs/ewald/tests/pmegathertest.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmesolvetest.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmesplinespreadtest.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmetestcommon.cpp		patch \| blob \| history
src/gromacs/ewald/tests/pmetestcommon.h		patch \| blob \| history
src/gromacs/ewald/tests/testhardwarecontext.cpp	[new file with mode: 0644]	patch \| blob
src/gromacs/ewald/tests/testhardwarecontext.h	[new file with mode: 0644]	patch \| blob
src/gromacs/ewald/tests/testhardwarecontexts.cpp		patch \| blob \| history
src/gromacs/ewald/tests/testhardwarecontexts.h		patch \| blob \| history
src/gromacs/gpu_utils/device_stream.cpp		patch \| blob \| history
src/gromacs/gpu_utils/device_stream_manager.cpp		patch \| blob \| history
src/gromacs/gpu_utils/device_stream_manager.h		patch \| blob \| history
src/gromacs/gpu_utils/tests/device_stream_manager.cpp		patch \| blob \| history
src/gromacs/listed_forces/gpubonded_impl.cu		patch \| blob \| history
src/gromacs/mdlib/update_constrain_gpu_impl.cu		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdrun/runner.cpp		patch \| blob \| history
src/gromacs/mdtypes/forcerec.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h		patch \| blob \| history
src/gromacs/nbnxm/gpu_common.h		patch \| blob \| history
src/gromacs/nbnxm/gpu_data_mgmt.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_setup.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h		patch \| blob \| history