Make use of the DeviceStreamManager
authorArtem Zhmurov <zhmurov@gmail.com>
Mon, 24 Feb 2020 09:22:40 +0000 (10:22 +0100)
committerArtem Zhmurov <zhmurov@gmail.com>
Wed, 25 Mar 2020 06:47:15 +0000 (07:47 +0100)
Use the DeviceStreamManager throughout the code. The manager is
owned by the runner and created when GPU is active. The consumers
get the context and streams if needed.

TODOs:
1. Introduce builders and move the decision on whether a stream should
   be created into them. The builders should take the manager and pass
   the context and the stream to the consumer. Builders should have
   the option to create a stream.
2. Makefile in ewald tests uses old infrastructure. Also, the device
   context management should be lifted from there and utilized in
   all the tests that can run on GPU hardware.

Refs #3316
Refs #3311

Change-Id: I0d08adbe1dee19c1890e55f0e0cf79cea97d39bd

48 files changed:
src/gromacs/domdec/domdec.cpp
src/gromacs/domdec/domdec.h
src/gromacs/ewald/pme.cpp
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_types_host.h
src/gromacs/ewald/pme_gpu_types_host_impl.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_only.h
src/gromacs/ewald/pme_pp_comm_gpu.h
src/gromacs/ewald/pme_pp_comm_gpu_impl.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/pme_pp_comm_gpu_impl.h
src/gromacs/ewald/tests/CMakeLists.txt
src/gromacs/ewald/tests/pmegathertest.cpp
src/gromacs/ewald/tests/pmesolvetest.cpp
src/gromacs/ewald/tests/pmesplinespreadtest.cpp
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/ewald/tests/pmetestcommon.h
src/gromacs/ewald/tests/testhardwarecontext.cpp [new file with mode: 0644]
src/gromacs/ewald/tests/testhardwarecontext.h [new file with mode: 0644]
src/gromacs/ewald/tests/testhardwarecontexts.cpp
src/gromacs/ewald/tests/testhardwarecontexts.h
src/gromacs/gpu_utils/device_stream.cpp
src/gromacs/gpu_utils/device_stream_manager.cpp
src/gromacs/gpu_utils/device_stream_manager.h
src/gromacs/gpu_utils/tests/device_stream_manager.cpp
src/gromacs/listed_forces/gpubonded_impl.cu
src/gromacs/mdlib/update_constrain_gpu_impl.cu
src/gromacs/mdrun/md.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/forcerec.h
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h
src/gromacs/nbnxm/gpu_common.h
src/gromacs/nbnxm/gpu_data_mgmt.h
src/gromacs/nbnxm/nbnxm.h
src/gromacs/nbnxm/nbnxm_setup.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h

index ebcc92bf2ea1949d794860cc22f0db393771fefb..aed2af87cbe5522259c31d9311df501fa50cbae0 100644 (file)
@@ -64,6 +64,7 @@
 #include "gromacs/domdec/partition.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/hw_info.h"
 #include "gromacs/listed_forces/manage_threading.h"
@@ -3200,13 +3201,16 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const g
     return bCutoffAllowed;
 }
 
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
-                              const t_commrec&     cr,
-                              const DeviceContext& deviceContext,
-                              const DeviceStream&  streamLocal,
-                              const DeviceStream&  streamNonLocal)
+void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
+                              const t_commrec&                cr,
+                              const gmx::DeviceStreamManager& deviceStreamManager)
 {
-
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be valid when using "
+                       "GPU halo exchange.");
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                       "Non-local non-bonded stream should be valid when using "
+                       "GPU halo exchange.");
     int gpuHaloExchangeSize = 0;
     int pulseStart          = 0;
     if (cr.dd->gpuHaloExchange.empty())
@@ -3228,7 +3232,9 @@ void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
         for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
         {
             cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
-                    cr.dd, cr.mpi_comm_mysim, deviceContext, streamLocal, streamNonLocal, pulse));
+                    cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(),
+                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
+                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse));
         }
     }
 }
index da617607a64a1acf500d3c83338e4da823beced7..82cefa18c1eafcd85dab35aae4f4fda876e61dfc 100644 (file)
@@ -84,13 +84,13 @@ struct t_mdatoms;
 struct t_nrnb;
 struct gmx_wallcycle;
 enum class PbcType : int;
-class DeviceStream;
 class t_state;
 class DeviceContext;
 class GpuEventSynchronizer;
 
 namespace gmx
 {
+class DeviceStreamManager;
 class ForceWithShiftForces;
 class MDLogger;
 class RangePartitioning;
@@ -313,18 +313,15 @@ void dd_bonded_cg_distance(const gmx::MDLogger& mdlog,
                            real*                r_2b,
                            real*                r_mb);
 
-/*! \brief Construct the GPU halo exchange object(s)
- * \param[in] mdlog          The logger object
- * \param[in] cr             The commrec object
- * \param[in] deviceContext  GPU device context
- * \param[in] streamLocal    The local GPU stream
- * \param[in] streamNonLocal The non-local GPU stream
+/*! \brief Construct the GPU halo exchange object(s).
+ *
+ * \param[in] mdlog               The logger object.
+ * \param[in] cr                  The commrec object.
+ * \param[in] deviceStreamManager Manager of the GPU context and streams.
  */
-void constructGpuHaloExchange(const gmx::MDLogger& mdlog,
-                              const t_commrec&     cr,
-                              const DeviceContext& deviceContext,
-                              const DeviceStream&  streamLocal,
-                              const DeviceStream&  streamNonLocal);
+void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
+                              const t_commrec&                cr,
+                              const gmx::DeviceStreamManager& deviceStreamManager);
 
 /*! \brief
  * (Re-) Initialization for GPU halo exchange
index 8d8bb673c22500f46c72600ed2ecfaa7ac5f887e..120887e8bbea3567ee059aea4ccb66fef4e966a2 100644 (file)
@@ -560,19 +560,20 @@ static int div_round_up(int enumerator, int denominator)
     return (enumerator + denominator - 1) / denominator;
 }
 
-gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
-                        const NumPmeDomains&     numPmeDomains,
-                        const t_inputrec*        ir,
-                        gmx_bool                 bFreeEnergy_q,
-                        gmx_bool                 bFreeEnergy_lj,
-                        gmx_bool                 bReproducible,
-                        real                     ewaldcoeff_q,
-                        real                     ewaldcoeff_lj,
-                        int                      nthread,
-                        PmeRunMode               runMode,
-                        PmeGpu*                  pmeGpu,
-                        const DeviceInformation* deviceInfo,
-                        const PmeGpuProgram*     pmeGpuProgram,
+gmx_pme_t* gmx_pme_init(const t_commrec*     cr,
+                        const NumPmeDomains& numPmeDomains,
+                        const t_inputrec*    ir,
+                        gmx_bool             bFreeEnergy_q,
+                        gmx_bool             bFreeEnergy_lj,
+                        gmx_bool             bReproducible,
+                        real                 ewaldcoeff_q,
+                        real                 ewaldcoeff_lj,
+                        int                  nthread,
+                        PmeRunMode           runMode,
+                        PmeGpu*              pmeGpu,
+                        const DeviceContext* deviceContext,
+                        const DeviceStream*  deviceStream,
+                        const PmeGpuProgram* pmeGpuProgram,
                         const gmx::MDLogger& /*mdlog*/)
 {
     int  use_threads, sum_use_threads, i;
@@ -883,8 +884,13 @@ gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
         {
             GMX_THROW(gmx::NotImplementedError(errorString));
         }
+        pme_gpu_reinit(pme.get(), deviceContext, deviceStream, pmeGpuProgram);
     }
-    pme_gpu_reinit(pme.get(), deviceInfo, pmeGpuProgram);
+    else
+    {
+        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object when PME is on a CPU.");
+    }
+
 
     pme_init_all_work(&pme->solve_work, pme->nthread, pme->nkx);
 
@@ -925,7 +931,7 @@ void gmx_pme_reinit(struct gmx_pme_t** pmedata,
         NumPmeDomains numPmeDomains = { pme_src->nnodes_major, pme_src->nnodes_minor };
         *pmedata = gmx_pme_init(cr, numPmeDomains, &irc, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE,
                                 ewaldcoeff_q, ewaldcoeff_lj, pme_src->nthread, pme_src->runMode,
-                                pme_src->gpu, nullptr, nullptr, dummyLogger);
+                                pme_src->gpu, nullptr, nullptr, nullptr, dummyLogger);
         /* When running PME on the CPU not using domain decomposition,
          * the atom data is allocated once only in gmx_pme_(re)init().
          */
index bfc79b88e99bfa5ef037fdf64e519b6da385f539..59aa7c604d5eb8e26d801326dde106bc55eda910 100644 (file)
@@ -64,7 +64,6 @@ struct t_inputrec;
 struct t_nrnb;
 struct PmeGpu;
 struct gmx_wallclock_gpu_pme_t;
-struct DeviceInformation;
 struct gmx_enerdata_t;
 struct gmx_mtop_t;
 struct gmx_pme_t;
@@ -137,20 +136,21 @@ bool gmx_pme_check_restrictions(int  pme_order,
  * related things whose lifetime can/should exceed that of a task (or
  * perhaps task manager). See Redmine #2522.
  */
-gmx_pme_t* gmx_pme_init(const t_commrec*         cr,
-                        const NumPmeDomains&     numPmeDomains,
-                        const t_inputrec*        ir,
-                        gmx_bool                 bFreeEnergy_q,
-                        gmx_bool                 bFreeEnergy_lj,
-                        gmx_bool                 bReproducible,
-                        real                     ewaldcoeff_q,
-                        real                     ewaldcoeff_lj,
-                        int                      nthread,
-                        PmeRunMode               runMode,
-                        PmeGpu*                  pmeGpu,
-                        const DeviceInformation* deviceInfo,
-                        const PmeGpuProgram*     pmeGpuProgram,
-                        const gmx::MDLogger&     mdlog);
+gmx_pme_t* gmx_pme_init(const t_commrec*     cr,
+                        const NumPmeDomains& numPmeDomains,
+                        const t_inputrec*    ir,
+                        gmx_bool             bFreeEnergy_q,
+                        gmx_bool             bFreeEnergy_lj,
+                        gmx_bool             bReproducible,
+                        real                 ewaldcoeff_q,
+                        real                 ewaldcoeff_lj,
+                        int                  nthread,
+                        PmeRunMode           runMode,
+                        PmeGpu*              pmeGpu,
+                        const DeviceContext* deviceContext,
+                        const DeviceStream*  deviceStream,
+                        const PmeGpuProgram* pmeGpuProgram,
+                        const gmx::MDLogger& mdlog);
 
 /*! \brief As gmx_pme_init, but takes most settings, except the grid/Ewald coefficients, from
  * pme_src. This is only called when the PME cut-off/grid size changes.
@@ -433,13 +433,6 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_device_x(const gmx_pme_t*        GPU_FUNC_AR
 GPU_FUNC_QUALIFIER void* pme_gpu_get_device_f(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
-/*! \brief Returns the pointer to the GPU stream.
- *  \param[in] pme            The PME data structure.
- *  \returns                  Pointer to GPU stream object.
- */
-GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
  * \param[in] pme            The PME data structure.
 * \returns                  Pointer to synchronizer
index 91596e77d483be31bca315568ae91da3671ea087..a5f54f004d1472fd15d7d2e5b5c3f29eb199c352 100644 (file)
@@ -433,15 +433,6 @@ void pme_gpu_set_device_x(const gmx_pme_t* pme, DeviceBuffer<gmx::RVec> d_x)
     pme_gpu_set_kernelparam_coordinates(pme->gpu, d_x);
 }
 
-const DeviceStream* pme_gpu_get_device_stream(const gmx_pme_t* pme)
-{
-    if (!pme || !pme_gpu_active(pme))
-    {
-        return nullptr;
-    }
-    return pme_gpu_get_stream(pme->gpu);
-}
-
 GpuEventSynchronizer* pme_gpu_get_f_ready_synchronizer(const gmx_pme_t* pme)
 {
     if (!pme || !pme_gpu_active(pme))
index c7a6df563a9bf317c6a26ff4b06a9705501606e1..62f7cc5c7e26a1d7fa29fe99f5b164ca712b1d30 100644 (file)
@@ -56,6 +56,8 @@
 #include <string>
 
 #include "gromacs/ewald/ewald_utils.h"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/math/units.h"
@@ -462,16 +464,22 @@ void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu)
     pmeGpu->archSpecific->syncSpreadGridD2H.waitForEvent();
 }
 
-void pme_gpu_init_internal(PmeGpu* pmeGpu)
+/*! \brief Internal GPU initialization for PME.
+ *
+ * \param[in]  pmeGpu         GPU PME data.
+ * \param[in]  deviceContext  GPU context.
+ * \param[in]  deviceStream   GPU stream.
+ */
+static void pme_gpu_init_internal(PmeGpu* pmeGpu, const DeviceContext& deviceContext, const DeviceStream& deviceStream)
 {
 #if GMX_GPU == GMX_GPU_CUDA
     // Prepare to use the device that this PME task was assigned earlier.
     // Other entities, such as CUDA timing events, are known to implicitly use the device context.
-    CU_RET_ERR(cudaSetDevice(pmeGpu->deviceInfo->id), "Switching to PME CUDA device");
+    CU_RET_ERR(cudaSetDevice(deviceContext.deviceInfo().id), "Switching to PME CUDA device");
 #endif
 
     /* Allocate the target-specific structures */
-    pmeGpu->archSpecific.reset(new PmeGpuSpecific(pmeGpu->programHandle_->impl_->deviceContext_));
+    pmeGpu->archSpecific.reset(new PmeGpuSpecific(deviceContext, deviceStream));
     pmeGpu->kernelParams.reset(new PmeGpuKernelParams());
 
     pmeGpu->archSpecific->performOutOfPlaceFFT = true;
@@ -480,33 +488,12 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
      * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
      */
 
-    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
-    if (GMX_GPU == GMX_GPU_CUDA)
-    {
-        /* WARNING: CUDA timings are incorrect with multiple streams.
-         *          This is the main reason why they are disabled by default.
-         */
-        // TODO: Consider turning on by default when we can detect nr of streams.
-        pmeGpu->archSpecific->useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
-    }
-    else if (GMX_GPU == GMX_GPU_OPENCL)
-    {
-        pmeGpu->archSpecific->useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
-    }
-
 #if GMX_GPU == GMX_GPU_CUDA
-    pmeGpu->maxGridWidthX = pmeGpu->deviceInfo->prop.maxGridSize[0];
+    pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
 #elif GMX_GPU == GMX_GPU_OPENCL
     pmeGpu->maxGridWidthX = INT32_MAX / 2;
     // TODO: is there no really global work size limit in OpenCL?
 #endif
-
-    /* Creating a PME GPU stream:
-     * - default high priority with CUDA
-     * - no priorities implemented yet with OpenCL; see #2532
-     */
-    pmeGpu->archSpecific->pmeStream_.init(pmeGpu->archSpecific->deviceContext_,
-                                          DeviceStreamPriority::High, pmeGpu->archSpecific->useTiming);
 }
 
 void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
@@ -721,13 +708,15 @@ static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeG
  * TODO: this should become PmeGpu::PmeGpu()
  *
  * \param[in,out] pme            The PME structure.
- * \param[in,out] deviceInfo     The GPU device information structure.
- * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
+ * \param[in]     deviceContext  The GPU context.
+ * \param[in]     deviceStream   The GPU stream.
+ * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
  */
-static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
+static void pme_gpu_init(gmx_pme_t*           pme,
+                         const DeviceContext& deviceContext,
+                         const DeviceStream&  deviceStream,
+                         const PmeGpuProgram* pmeGpuProgram)
 {
-    GMX_ASSERT(deviceInfo != nullptr,
-               "Device information can not be nullptr when GPU is used for PME.");
     pme->gpu       = new PmeGpu();
     PmeGpu* pmeGpu = pme->gpu;
     changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
@@ -743,13 +732,12 @@ static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, co
 
     pme_gpu_set_testing(pmeGpu, false);
 
-    pmeGpu->deviceInfo = deviceInfo;
     GMX_ASSERT(pmeGpuProgram != nullptr, "GPU kernels must be already compiled");
     pmeGpu->programHandle_ = pmeGpuProgram;
 
     pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();
 
-    pme_gpu_init_internal(pmeGpu);
+    pme_gpu_init_internal(pmeGpu, deviceContext, deviceStream);
     pme_gpu_alloc_energy_virial(pmeGpu);
 
     pme_gpu_copy_common_data_from(pme);
@@ -773,19 +761,21 @@ void pme_gpu_get_real_grid_sizes(const PmeGpu* pmeGpu, gmx::IVec* gridSize, gmx:
     }
 }
 
-void pme_gpu_reinit(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
+void pme_gpu_reinit(gmx_pme_t*           pme,
+                    const DeviceContext* deviceContext,
+                    const DeviceStream*  deviceStream,
+                    const PmeGpuProgram* pmeGpuProgram)
 {
     GMX_ASSERT(pme != nullptr, "Need valid PME object");
-    if (pme->runMode == PmeRunMode::CPU)
-    {
-        GMX_ASSERT(pme->gpu == nullptr, "Should not have PME GPU object");
-        return;
-    }
 
     if (!pme->gpu)
     {
+        GMX_RELEASE_ASSERT(deviceContext != nullptr,
+                           "Device context can not be nullptr when setting up PME on GPU.");
+        GMX_RELEASE_ASSERT(deviceStream != nullptr,
+                           "Device stream can not be nullptr when setting up PME on GPU.");
         /* First-time initialization */
-        pme_gpu_init(pme, deviceInfo, pmeGpuProgram);
+        pme_gpu_init(pme, *deviceContext, *deviceStream, pmeGpuProgram);
     }
     else
     {
@@ -1358,18 +1348,6 @@ void pme_gpu_set_kernelparam_coordinates(const PmeGpu* pmeGpu, DeviceBuffer<gmx:
     pmeGpu->kernelParams->atoms.d_coordinates = d_x;
 }
 
-const DeviceStream* pme_gpu_get_stream(const PmeGpu* pmeGpu)
-{
-    if (pmeGpu)
-    {
-        return &pmeGpu->archSpecific->pmeStream_;
-    }
-    else
-    {
-        return nullptr;
-    }
-}
-
 GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
 {
     if (pmeGpu && pmeGpu->kernelParams)
index 3d764fd468a57448424199574a50a076d5aa60da..9a15c3bbc986f0b0141a84dc61db2ac1fd11f3cf 100644 (file)
 #include "pme_gpu_types_host.h"
 #include "pme_output.h"
 
-class GpuEventSynchronizer;
+class DeviceContext;
 struct DeviceInformation;
+class DeviceStream;
+class GpuEventSynchronizer;
 struct gmx_hw_info_t;
 struct gmx_gpu_opt_t;
 struct gmx_pme_t; // only used in pme_gpu_reinit
@@ -71,7 +73,7 @@ struct t_complex;
 namespace gmx
 {
 class MDLogger;
-}
+} // namespace gmx
 
 //! Type of spline data
 enum class PmeSplineDataType
@@ -299,14 +301,6 @@ void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu);
  */
 void pme_gpu_sync_spread_grid(const PmeGpu* pmeGpu);
 
-/*! \libinternal \brief
- * Does the one-time GPU-framework specific PME initialization.
- * For CUDA, the PME stream is created with the highest priority.
- *
- * \param[in] pmeGpu  The PME GPU structure.
- */
-void pme_gpu_init_internal(PmeGpu* pmeGpu);
-
 /*! \libinternal \brief
  * Initializes the CUDA FFT structures.
  *
@@ -387,13 +381,6 @@ GPU_FUNC_QUALIFIER void pme_gpu_set_kernelparam_coordinates(const PmeGpu* GPU_FU
 GPU_FUNC_QUALIFIER void* pme_gpu_get_kernelparam_forces(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
-/*! \brief Return pointer to GPU stream.
- * \param[in] pmeGpu         The PME GPU structure.
- * \returns                  Pointer to stream object.
- */
-GPU_FUNC_QUALIFIER const DeviceStream* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
-        GPU_FUNC_TERM_WITH_RETURN(nullptr);
-
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to sync object
@@ -498,13 +485,16 @@ GPU_FUNC_QUALIFIER void pme_gpu_get_real_grid_sizes(const PmeGpu* GPU_FUNC_ARGUM
 /*! \libinternal \brief
  * (Re-)initializes the PME GPU data at the beginning of the run or on DLB.
  *
- * \param[in,out] pme             The PME structure.
- * \param[in]     deviceInfo      The GPU device information structure.
- * \param[in]     pmeGpuProgram   The PME GPU program data
+ * \param[in,out] pme            The PME structure.
+ * \param[in]     deviceContext  The GPU context.
+ * \param[in]     deviceStream   The GPU stream.
+ * \param[in]     pmeGpuProgram  The handle to the program/kernel data created outside (e.g. in unit tests/runner)
+ *
  * \throws gmx::NotImplementedError if this generally valid PME structure is not valid for GPU runs.
  */
-GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t*               GPU_FUNC_ARGUMENT(pme),
-                                       const DeviceInformation* GPU_FUNC_ARGUMENT(deviceInfo),
+GPU_FUNC_QUALIFIER void pme_gpu_reinit(gmx_pme_t*           GPU_FUNC_ARGUMENT(pme),
+                                       const DeviceContext* GPU_FUNC_ARGUMENT(deviceContext),
+                                       const DeviceStream*  GPU_FUNC_ARGUMENT(deviceStream),
                                        const PmeGpuProgram* GPU_FUNC_ARGUMENT(pmeGpuProgram)) GPU_FUNC_TERM;
 
 /*! \libinternal \brief
index 481c3a302d8151f53f3b613099ff5cfe8637bf52..9d7e2f78f9a842acd0f7b5c6bf5d8268ecf407fa 100644 (file)
@@ -162,9 +162,6 @@ struct PmeGpu
      */
     int nAtomsAlloc;
 
-    /*! \brief A pointer to the device used during the execution. */
-    const DeviceInformation* deviceInfo;
-
     /*! \brief Kernel scheduling grid width limit in X - derived from deviceinfo compute capability in CUDA.
      * Declared as very large int to make it useful in computations with type promotion, to avoid overflows.
      * OpenCL seems to not have readily available global work size limit, so we just assign a large arbitrary constant to this instead.
index a019a7c0310b599ba6dda7c2e4fac04b65355f43..e134d2c0a79d306d4a6793c24c5ba6bfe2767b63 100644 (file)
@@ -71,9 +71,14 @@ struct PmeGpuSpecific
 {
     /*! \brief Constructor
      *
-     * \param[in] deviceContext GPU device context.
+     * \param[in] deviceContext  GPU device context.
+     * \param[in] pmeStream      GPU PME stream.
      */
-    PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
+    PmeGpuSpecific(const DeviceContext& deviceContext, const DeviceStream& pmeStream) :
+        deviceContext_(deviceContext),
+        pmeStream_(pmeStream)
+    {
+    }
 
     /*! \brief
      * A handle to the GPU context.
@@ -84,7 +89,7 @@ struct PmeGpuSpecific
     const DeviceContext& deviceContext_;
 
     /*! \brief The GPU stream where everything related to the PME happens. */
-    DeviceStream pmeStream_;
+    const DeviceStream& pmeStream_;
 
     /* Synchronization events */
     /*! \brief Triggered after the PME Force Calculations have been completed */
index b85629b28af6ef4a630573a5b37466e4b1ac062f..26d824e3a2adfa66e18d36fb3e65b312b84ef92f 100644 (file)
@@ -82,6 +82,7 @@
 #include "gromacs/fileio/pdbio.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/math/gmxcomplex.h"
 #include "gromacs/math/units.h"
@@ -597,14 +598,14 @@ static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
 #endif
 }
 
-int gmx_pmeonly(struct gmx_pme_t*         pme,
-                const t_commrec*          cr,
-                t_nrnb*                   mynrnb,
-                gmx_wallcycle*            wcycle,
-                gmx_walltime_accounting_t walltime_accounting,
-                t_inputrec*               ir,
-                PmeRunMode                runMode,
-                const DeviceContext*      deviceContext)
+int gmx_pmeonly(struct gmx_pme_t*               pme,
+                const t_commrec*                cr,
+                t_nrnb*                         mynrnb,
+                gmx_wallcycle*                  wcycle,
+                gmx_walltime_accounting_t       walltime_accounting,
+                t_inputrec*                     ir,
+                PmeRunMode                      runMode,
+                const gmx::DeviceStreamManager* deviceStreamManager)
 {
     int     ret;
     int     natoms = 0;
@@ -629,25 +630,27 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
     const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
     if (useGpuForPme)
     {
-        const DeviceStream& deviceStream = *pme_gpu_get_device_stream(pme);
-
+        GMX_RELEASE_ASSERT(
+                deviceStreamManager != nullptr,
+                "Device stream manager can not be nullptr when using GPU in PME-only rank.");
+        GMX_RELEASE_ASSERT(deviceStreamManager->streamIsValid(gmx::DeviceStreamType::Pme),
+                           "Device stream can not be nullptr when using GPU in PME-only rank");
         changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
         changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
         if (c_enableGpuPmePpComms)
         {
             pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
-                    deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme), pme_pp->mpi_comm_mysim,
+                    pme_pp->ppRanks);
             pme_pp->pmeForceSenderGpu = std::make_unique<gmx::PmeForceSenderGpu>(
-                    deviceStream, pme_pp->mpi_comm_mysim, pme_pp->ppRanks);
+                    deviceStreamManager->stream(gmx::DeviceStreamType::Pme), pme_pp->mpi_comm_mysim,
+                    pme_pp->ppRanks);
         }
-        GMX_RELEASE_ASSERT(
-                deviceContext != nullptr,
-                "Device context can not be nullptr when building GPU propagator data object.");
         // TODO: Special PME-only constructor is used here. There is no mechanism to prevent from using the other constructor here.
         //       This should be made safer.
-        stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(&deviceStream, *deviceContext,
-                                                                 GpuApiCallBehavior::Async,
-                                                                 pme_gpu_get_block_size(pme), wcycle);
+        stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
+                &deviceStreamManager->stream(gmx::DeviceStreamType::Pme), deviceStreamManager->context(),
+                GpuApiCallBehavior::Async, pme_gpu_get_block_size(pme), wcycle);
     }
 
     clear_nrnb(mynrnb);
index 18edbb9b43ddf5bcdbe23127f58a2e79ece71027..1a71ea195c1b74ad44d168bd20753cfb0706e4d4 100644 (file)
@@ -55,17 +55,20 @@ struct t_nrnb;
 struct gmx_pme_t;
 struct gmx_wallcycle;
 
-class DeviceContext;
 enum class PmeRunMode;
+namespace gmx
+{
+class DeviceStreamManager;
+}
 
 /*! \brief Called on the nodes that do PME exclusively */
-int gmx_pmeonly(gmx_pme_t*                pme,
-                const t_commrec*          cr,
-                t_nrnb*                   mynrnb,
-                gmx_wallcycle*            wcycle,
-                gmx_walltime_accounting_t walltime_accounting,
-                t_inputrec*               ir,
-                PmeRunMode                runMode,
-                const DeviceContext*      deviceContext);
+int gmx_pmeonly(gmx_pme_t*                      pme,
+                const t_commrec*                cr,
+                t_nrnb*                         mynrnb,
+                gmx_wallcycle*                  wcycle,
+                gmx_walltime_accounting_t       walltime_accounting,
+                t_inputrec*                     ir,
+                PmeRunMode                      runMode,
+                const gmx::DeviceStreamManager* deviceStreamManager);
 
 #endif
index ea750cc17c1052fb95a639dd2512da2d30b905e8..97accca871d2b27da7d8e28235871a3c9b8877e1 100644 (file)
 #include "gromacs/utility/gmxmpi.h"
 
 class DeviceContext;
+class DeviceStream;
 class GpuEventSynchronizer;
 
 namespace gmx
 {
 
+class DeviceStreamManager;
+
 /*! \libinternal
 
  * \brief Manages communication related to GPU buffers between this
@@ -63,8 +66,9 @@ public:
      * \param[in] comm            Communicator used for simulation
      * \param[in] pmeRank         Rank of PME task
      * \param[in] deviceContext   GPU context.
+     * \param[in] deviceStream    GPU stream.
      */
-    PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
+    PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
     ~PmePpCommGpu();
 
     /*! \brief Perform steps required when buffer size changes
index b8befc5311033680740c329c99d588b823226383..b53ce94ada0b13236bd22d2c2ed8180162665f36 100644 (file)
@@ -62,7 +62,10 @@ class PmePpCommGpu::Impl
 };
 
 /*!\brief Constructor stub. */
-PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */, int /* pmeRank */, const DeviceContext& /* deviceContext */) :
+PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */,
+                           int /* pmeRank */,
+                           const DeviceContext& /* deviceContext */,
+                           const DeviceStream& /* deviceStream */) :
     impl_(nullptr)
 {
     GMX_ASSERT(false,
index 0e78978865bb4866fead00907c5d2272a5f0f80e..2c6f696ddd41a9d754c9ebab550db021f8d528c0 100644 (file)
@@ -49,6 +49,7 @@
 
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/utility/gmxmpi.h"
 namespace gmx
 {
 
-PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
+PmePpCommGpu::Impl::Impl(MPI_Comm             comm,
+                         int                  pmeRank,
+                         const DeviceContext& deviceContext,
+                         const DeviceStream&  deviceStream) :
     deviceContext_(deviceContext),
+    pmePpCommStream_(deviceStream),
     comm_(comm),
     pmeRank_(pmeRank)
 {
     GMX_RELEASE_ASSERT(
             GMX_THREAD_MPI,
             "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
-
-    // In CUDA we only need priority to create stream.
-    // (note that this will be moved from here in the follow-up patch)
-    pmePpCommStream_.init(deviceContext, DeviceStreamPriority::Normal, false);
 }
 
 PmePpCommGpu::Impl::~Impl() = default;
@@ -158,8 +159,11 @@ void* PmePpCommGpu::Impl::getForcesReadySynchronizer()
     return static_cast<void*>(&forcesReadySynchronizer_);
 }
 
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext) :
-    impl_(new Impl(comm, pmeRank, deviceContext))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm             comm,
+                           int                  pmeRank,
+                           const DeviceContext& deviceContext,
+                           const DeviceStream&  deviceStream) :
+    impl_(new Impl(comm, pmeRank, deviceContext, deviceStream))
 {
 }
 
index 934b7c40c62ed1a1daa3eb58f607cb494554d3dd..4c95d9bccdef21745a470389f0043376d8aaf428 100644 (file)
@@ -57,11 +57,13 @@ class PmePpCommGpu::Impl
 
 public:
     /*! \brief Creates PME-PP GPU communication object.
+     *
      * \param[in] comm            Communicator used for simulation
      * \param[in] pmeRank         Rank of PME task
      * \param[in] deviceContext   GPU context.
+     * \param[in] deviceStream    GPU stream.
      */
-    Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext);
+    Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
     ~Impl();
 
     /*! \brief Perform steps required when buffer size changes
@@ -116,10 +118,10 @@ public:
     void* getForcesReadySynchronizer();
 
 private:
-    //! Device context object
+    //! GPU context handle (not used in CUDA)
     const DeviceContext& deviceContext_;
-    //! CUDA stream used for the communication operations in this class
-    DeviceStream pmePpCommStream_;
+    //! Handle for CUDA stream used for the communication operations in this class
+    const DeviceStream& pmePpCommStream_;
     //! Remote location of PME coordinate data buffer
     void* remotePmeXBuffer_ = nullptr;
     //! Remote location of PME force data buffer
index 6dea2b21ecca50c3a1ffc77a158c56729fa342d2..170bc96c120fee31877c8adb89504ee8660f9c7c 100644 (file)
@@ -41,4 +41,13 @@ gmx_add_unit_test(EwaldUnitTests ewald-test HARDWARE_DETECTION
         testhardwarecontexts.cpp
     GPU_CPP_SOURCE_FILES
         pmetestcommon.cpp
-        )
+)
+
+gmx_add_libgromacs_sources(
+    testhardwarecontext.cpp
+)
+if (GMX_USE_CUDA)
+gmx_compile_cpp_as_cuda(
+    testhardwarecontext.cpp
+)
+endif()
index 51035f035576a8a031396a88d7a8506583ef0272..eae6448323fe323e9d1eef384c727752258ffbd1 100644 (file)
@@ -280,13 +280,13 @@ public:
         TestReferenceData refData;
         for (const auto& context : getPmeTestEnv()->getHardwareContexts())
         {
-            CodePath   codePath = context->getCodePath();
+            CodePath   codePath = context->codePath();
             const bool supportedInput =
                     pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
             if (!supportedInput)
             {
                 /* Testing the failure for the unsupported input */
-                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, box),
+                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, nullptr, box),
                                  NotImplementedError);
                 continue;
             }
@@ -295,14 +295,16 @@ public:
             SCOPED_TRACE(
                     formatString("Testing force gathering with %s %sfor PME grid size %d %d %d"
                                  ", order %d, %zu atoms",
-                                 codePathToString(codePath), context->getDescription().c_str(),
+                                 codePathToString(codePath), context->description().c_str(),
                                  gridSize[XX], gridSize[YY], gridSize[ZZ], pmeOrder, atomCount));
 
-            PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
-                                                    context->getPmeGpuProgram(), box);
+            PmeSafePointer pmeSafe =
+                    pmeInitWrapper(&inputRec, codePath, context->deviceContext(),
+                                   context->deviceStream(), context->pmeGpuProgram(), box);
             std::unique_ptr<StatePropagatorDataGpu> stateGpu =
                     (codePath == CodePath::GPU)
-                            ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+                            ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext(),
+                                                         context->deviceStream())
                             : nullptr;
 
             pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, inputAtomData.coordinates,
index 86688822ebcc8a1f70b980fab2d039e30d32730f..07f31629f6317c136379dd135a59135675946af7 100644 (file)
@@ -109,13 +109,13 @@ public:
         TestReferenceData refData;
         for (const auto& context : getPmeTestEnv()->getHardwareContexts())
         {
-            CodePath   codePath = context->getCodePath();
+            CodePath   codePath = context->codePath();
             const bool supportedInput =
                     pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
             if (!supportedInput)
             {
                 /* Testing the failure for the unsupported input */
-                EXPECT_THROW_GMX(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, box,
+                EXPECT_THROW_GMX(pmeInitEmpty(&inputRec, codePath, nullptr, nullptr, nullptr, box,
                                               ewaldCoeff_q, ewaldCoeff_lj),
                                  NotImplementedError);
                 continue;
@@ -137,13 +137,13 @@ public:
                             "size %d %d %d, Ewald coefficients %g %g",
                             (method == PmeSolveAlgorithm::LennardJones) ? "Lennard-Jones" : "Coulomb",
                             gridOrdering.second.c_str(), computeEnergyAndVirial ? "with" : "without",
-                            codePathToString(codePath), context->getDescription().c_str(),
+                            codePathToString(codePath), context->description().c_str(),
                             gridSize[XX], gridSize[YY], gridSize[ZZ], ewaldCoeff_q, ewaldCoeff_lj));
 
                     /* Running the test */
-                    PmeSafePointer pmeSafe =
-                            pmeInitEmpty(&inputRec, codePath, context->getDeviceInfo(),
-                                         context->getPmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
+                    PmeSafePointer pmeSafe = pmeInitEmpty(
+                            &inputRec, codePath, context->deviceContext(), context->deviceStream(),
+                            context->pmeGpuProgram(), box, ewaldCoeff_q, ewaldCoeff_lj);
                     pmeSetComplexGrid(pmeSafe.get(), codePath, gridOrdering.first, nonZeroGridValues);
                     const real cellVolume = box[0] * box[4] * box[8];
                     // FIXME - this is box[XX][XX] * box[YY][YY] * box[ZZ][ZZ], should be stored in the PME structure
index 5c2d8663ef593f5b52c2973cc45fb8bb058389a2..ef975b2d084389d1a7fe7240bacb55227440710d 100644 (file)
@@ -126,13 +126,13 @@ public:
 
         for (const auto& context : getPmeTestEnv()->getHardwareContexts())
         {
-            CodePath   codePath = context->getCodePath();
+            CodePath   codePath = context->codePath();
             const bool supportedInput =
                     pmeSupportsInputForMode(*getPmeTestEnv()->hwinfo(), &inputRec, codePath);
             if (!supportedInput)
             {
                 /* Testing the failure for the unsupported input */
-                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, box),
+                EXPECT_THROW_GMX(pmeInitWrapper(&inputRec, codePath, nullptr, nullptr, nullptr, box),
                                  NotImplementedError);
                 continue;
             }
@@ -141,20 +141,21 @@ public:
             {
                 /* Describing the test uniquely in case it fails */
 
-                SCOPED_TRACE(
-                        formatString("Testing %s with %s %sfor PME grid size %d %d %d"
-                                     ", order %d, %zu atoms",
-                                     option.second.c_str(), codePathToString(codePath),
-                                     context->getDescription().c_str(), gridSize[XX], gridSize[YY],
-                                     gridSize[ZZ], pmeOrder, atomCount));
+                SCOPED_TRACE(formatString(
+                        "Testing %s with %s %sfor PME grid size %d %d %d"
+                        ", order %d, %zu atoms",
+                        option.second.c_str(), codePathToString(codePath), context->description().c_str(),
+                        gridSize[XX], gridSize[YY], gridSize[ZZ], pmeOrder, atomCount));
 
                 /* Running the test */
 
-                PmeSafePointer pmeSafe = pmeInitWrapper(&inputRec, codePath, context->getDeviceInfo(),
-                                                        context->getPmeGpuProgram(), box);
+                PmeSafePointer pmeSafe =
+                        pmeInitWrapper(&inputRec, codePath, context->deviceContext(),
+                                       context->deviceStream(), context->pmeGpuProgram(), box);
                 std::unique_ptr<StatePropagatorDataGpu> stateGpu =
                         (codePath == CodePath::GPU)
-                                ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext())
+                                ? makeStatePropagatorDataGpu(*pmeSafe.get(), context->deviceContext(),
+                                                             context->deviceStream())
                                 : nullptr;
 
                 pmeInitAtoms(pmeSafe.get(), stateGpu.get(), codePath, coordinates, charges);
index eaf697e1d5fd8d82ca7faaa39e8a32db4ab5ed67..888cac58734ead34abf88a1606470f1900d0c893 100644 (file)
@@ -59,6 +59,7 @@
 #include "gromacs/ewald/pme_solve.h"
 #include "gromacs/ewald/pme_spread.h"
 #include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/math/invertmatrix.h"
 #include "gromacs/mdtypes/commrec.h"
@@ -106,21 +107,22 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder)
 }
 
 //! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
-                              const CodePath           mode,
-                              const DeviceInformation* deviceInfo,
-                              const PmeGpuProgram*     pmeGpuProgram,
-                              const Matrix3x3&         box,
-                              const real               ewaldCoeff_q,
-                              const real               ewaldCoeff_lj)
+PmeSafePointer pmeInitWrapper(const t_inputrec*    inputRec,
+                              const CodePath       mode,
+                              const DeviceContext* deviceContext,
+                              const DeviceStream*  deviceStream,
+                              const PmeGpuProgram* pmeGpuProgram,
+                              const Matrix3x3&     box,
+                              const real           ewaldCoeff_q,
+                              const real           ewaldCoeff_lj)
 {
     const MDLogger dummyLogger;
     const auto     runMode       = (mode == CodePath::CPU) ? PmeRunMode::CPU : PmeRunMode::Mixed;
     t_commrec      dummyCommrec  = { 0 };
     NumPmeDomains  numPmeDomains = { 1, 1 };
-    gmx_pme_t*     pmeDataRaw =
-            gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true, ewaldCoeff_q,
-                         ewaldCoeff_lj, 1, runMode, nullptr, deviceInfo, pmeGpuProgram, dummyLogger);
+    gmx_pme_t* pmeDataRaw = gmx_pme_init(&dummyCommrec, numPmeDomains, inputRec, false, false, true,
+                                         ewaldCoeff_q, ewaldCoeff_lj, 1, runMode, nullptr,
+                                         deviceContext, deviceStream, pmeGpuProgram, dummyLogger);
     PmeSafePointer pme(pmeDataRaw); // taking ownership
 
     // TODO get rid of this with proper matrix type
@@ -151,33 +153,35 @@ PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
 }
 
 //! Simple PME initialization based on input, no atom data
-PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
-                            const CodePath           mode,
-                            const DeviceInformation* deviceInfo,
-                            const PmeGpuProgram*     pmeGpuProgram,
-                            const Matrix3x3&         box,
-                            const real               ewaldCoeff_q,
-                            const real               ewaldCoeff_lj)
-{
-    return pmeInitWrapper(inputRec, mode, deviceInfo, pmeGpuProgram, box, ewaldCoeff_q, ewaldCoeff_lj);
+PmeSafePointer pmeInitEmpty(const t_inputrec*    inputRec,
+                            const CodePath       mode,
+                            const DeviceContext* deviceContext,
+                            const DeviceStream*  deviceStream,
+                            const PmeGpuProgram* pmeGpuProgram,
+                            const Matrix3x3&     box,
+                            const real           ewaldCoeff_q,
+                            const real           ewaldCoeff_lj)
+{
+    return pmeInitWrapper(inputRec, mode, deviceContext, deviceStream, pmeGpuProgram, box,
+                          ewaldCoeff_q, ewaldCoeff_lj);
     // hiding the fact that PME actually needs to know the number of atoms in advance
 }
 
 PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec)
 {
     const Matrix3x3 defaultBox = { { 1.0F, 0.0F, 0.0F, 0.0F, 1.0F, 0.0F, 0.0F, 0.0F, 1.0F } };
-    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
+    return pmeInitWrapper(inputRec, CodePath::CPU, nullptr, nullptr, nullptr, defaultBox, 0.0F, 0.0F);
 }
 
 //! Make a GPU state-propagator manager
 std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
-                                                                   const DeviceContext& deviceContext)
+                                                                   const DeviceContext* deviceContext,
+                                                                   const DeviceStream* deviceStream)
 {
     // TODO: Pin the host buffer and use async memory copies
     // TODO: Special constructor for PME-only rank / PME-tests is used here. There should be a mechanism to
     //       restrict one from using other constructor here.
-    return std::make_unique<StatePropagatorDataGpu>(pme_gpu_get_device_stream(&pme), deviceContext,
-                                                    GpuApiCallBehavior::Sync,
+    return std::make_unique<StatePropagatorDataGpu>(deviceStream, *deviceContext, GpuApiCallBehavior::Sync,
                                                     pme_gpu_get_block_size(&pme), nullptr);
 }
 
index 98a2bbd4d2106092219be25eb8b8cffc6c9e22df..7f2e727c5f302ec2c9ac8185f71f050877627349 100644 (file)
@@ -55,6 +55,8 @@
 
 namespace gmx
 {
+
+class DeviceStreamManager;
 namespace test
 {
 
@@ -118,26 +120,31 @@ uint64_t getSplineModuliDoublePrecisionUlps(int splineOrder);
 // PME stages
 
 //! PME initialization
-PmeSafePointer pmeInitWrapper(const t_inputrec*        inputRec,
-                              CodePath                 mode,
-                              const DeviceInformation* deviceInfo,
-                              const PmeGpuProgram*     pmeGpuProgram,
-                              const Matrix3x3&         box,
-                              real                     ewaldCoeff_q  = 1.0F,
-                              real                     ewaldCoeff_lj = 1.0F);
+PmeSafePointer pmeInitWrapper(const t_inputrec*    inputRec,
+                              CodePath             mode,
+                              const DeviceContext* deviceContext,
+                              const DeviceStream*  deviceStream,
+                              const PmeGpuProgram* pmeGpuProgram,
+                              const Matrix3x3&     box,
+                              real                 ewaldCoeff_q  = 1.0F,
+                              real                 ewaldCoeff_lj = 1.0F);
 //! Simple PME initialization (no atom data)
-PmeSafePointer pmeInitEmpty(const t_inputrec*        inputRec,
-                            CodePath                 mode,
-                            const DeviceInformation* deviceInfo,
-                            const PmeGpuProgram*     pmeGpuProgram,
-                            const Matrix3x3&         box,
-                            real                     ewaldCoeff_q,
-                            real                     ewaldCoeff_lj);
+PmeSafePointer pmeInitEmpty(const t_inputrec*    inputRec,
+                            CodePath             mode,
+                            const DeviceContext* deviceContext,
+                            const DeviceStream*  deviceStream,
+                            const PmeGpuProgram* pmeGpuProgram,
+                            const Matrix3x3&     box,
+                            real                 ewaldCoeff_q,
+                            real                 ewaldCoeff_lj);
+
 //! Simple PME initialization based on inputrec only
 PmeSafePointer pmeInitEmpty(const t_inputrec* inputRec);
+
 //! Make a GPU state-propagator manager
 std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme_t&     pme,
-                                                                   const DeviceContext& deviceContext);
+                                                                   const DeviceContext* deviceContext,
+                                                                   const DeviceStream* deviceStream);
 //! PME initialization with atom data and system box
 void pmeInitAtoms(gmx_pme_t*               pme,
                   StatePropagatorDataGpu*  stateGpu,
diff --git a/src/gromacs/ewald/tests/testhardwarecontext.cpp b/src/gromacs/ewald/tests/testhardwarecontext.cpp
new file mode 100644 (file)
index 0000000..6e2c455
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Implements test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_ewald
+ */
+
+#include "gmxpre.h"
+
+#include "testhardwarecontext.h"
+
+#include <memory>
+
+#include "gromacs/ewald/pme.h"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/hw_info.h"
+#include "gromacs/utility/basenetwork.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/loggerbuilder.h"
+#include "gromacs/utility/physicalnodecommunicator.h"
+
+namespace gmx
+{
+namespace test
+{
+
+//! Constructs a context for the CPU code path: no device context or device stream is created.
+TestHardwareContext::TestHardwareContext(CodePath codePath, const char* description) :
+    codePath_(codePath),
+    description_(description)
+{
+    GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
+                       "A GPU code path should provide DeviceInformation to the "
+                       "TestHardwareContext constructor.");
+    // The CPU code path owns no device objects, so deviceContext() and deviceStream()
+    // return nullptr for such a context.
+    deviceContext_ = nullptr;
+    deviceStream_  = nullptr;
+}
+
+TestHardwareContext::TestHardwareContext(CodePath                 codePath,
+                                         const char*              description,
+                                         const DeviceInformation& deviceInfo) :
+    codePath_(codePath),
+    description_(description)
+{
+    GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
+                       "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
+                       "in CPU build.");
+    deviceContext_ = new DeviceContext(deviceInfo);
+    deviceStream_  = new DeviceStream(*deviceContext_, DeviceStreamPriority::Normal, false);
+    program_       = buildPmeGpuProgram(*deviceContext_);
+}
+
+//! Releases the owned device stream first, then the owned device context.
+TestHardwareContext::~TestHardwareContext()
+{
+    // Deleting a null pointer is a no-op, so CPU contexts need no special handling.
+    delete deviceStream_;
+    delete deviceContext_;
+}
+
+//! Returns the device information of the owned device context, or nullptr if there is none.
+const DeviceInformation* TestHardwareContext::deviceInfo() const
+{
+    // Contexts built for the CPU code path have no DeviceContext; avoid dereferencing nullptr.
+    return (deviceContext_ != nullptr) ? &deviceContext_->deviceInfo() : nullptr;
+}
+
+const DeviceContext* TestHardwareContext::deviceContext() const
+{
+    return deviceContext_; // nullptr for a context constructed for the CPU code path
+}
+//! Get the device stream (nullptr for a context constructed for the CPU code path)
+const DeviceStream* TestHardwareContext::deviceStream() const
+{
+    return deviceStream_;
+}
+
+const char* codePathToString(CodePath codePath) // maps a code path to a printable name
+{
+    switch (codePath)
+    {
+        case CodePath::CPU: return "CPU";
+        case CodePath::GPU: return "GPU";
+        default: GMX_THROW(NotImplementedError("This CodePath should support codePathToString")); // a CodePath value without a case above
+    }
+}
+
+} // namespace test
+} // namespace gmx
diff --git a/src/gromacs/ewald/tests/testhardwarecontext.h b/src/gromacs/ewald/tests/testhardwarecontext.h
new file mode 100644 (file)
index 0000000..fa5ebd9
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_EWALD_TEST_HARDWARE_CONTEXT_H
+#define GMX_EWALD_TEST_HARDWARE_CONTEXT_H
+
+/*! \internal \file
+ * \brief
+ * Describes test environment class which performs hardware enumeration for unit tests.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ * \ingroup module_ewald
+ */
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "gromacs/ewald/pme_gpu_program.h"
+#include "gromacs/utility/gmxassert.h"
+
+class DeviceContext;
+struct DeviceInformation;
+class DeviceStream;
+
+namespace gmx
+{
+namespace test
+{
+//! Hardware code path being tested
+enum class CodePath
+{
+    CPU, //!< Test the CPU code path
+    GPU  //!< Test the GPU code path
+};
+
+//! Return a string useful for human-readable messages describing a \c codePath.
+const char* codePathToString(CodePath codePath);
+
+/*! \internal \brief
+ * A structure to describe a hardware context  that persists over the lifetime
+ * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
+ */
+struct TestHardwareContext
+{
+    //! Hardware path for the code being tested.
+    CodePath codePath_;
+    //! Readable description
+    std::string description_;
+    //! Device context
+    DeviceContext* deviceContext_ = nullptr;
+    //! Device stream
+    DeviceStream* deviceStream_ = nullptr;
+    //! Persistent compiled GPU kernels for PME.
+    PmeGpuProgramStorage program_;
+
+public:
+    //! Retuns the code path for this context.
+    CodePath codePath() const { return codePath_; }
+    //! Returns a human-readable context description line
+    std::string description() const { return description_; }
+    //! Returns the device info pointer
+    const DeviceInformation* deviceInfo() const;
+    //! Get the device context
+    const DeviceContext* deviceContext() const;
+    //! Get the device stream
+    const DeviceStream* deviceStream() const;
+    //! Returns the persistent PME GPU kernels
+    const PmeGpuProgram* pmeGpuProgram() const { return program_.get(); }
+    //! Constructs the context for CPU builds
+    TestHardwareContext(CodePath codePath, const char* description);
+    //! Constructs the context for GPU builds
+    TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo);
+    //! Destructor
+    ~TestHardwareContext();
+};
+
+} // namespace test
+} // namespace gmx
+#endif
index 661f0fa4bb8c448d10432cdbf4dc17c204c8dc7d..96f36f9810899dc9a7ef0d492c9fe7587deb0bb9 100644 (file)
@@ -60,18 +60,6 @@ namespace gmx
 namespace test
 {
 
-TestHardwareContext::~TestHardwareContext() = default;
-
-const char* codePathToString(CodePath codePath)
-{
-    switch (codePath)
-    {
-        case CodePath::CPU: return "CPU";
-        case CodePath::GPU: return "GPU";
-        default: GMX_THROW(NotImplementedError("This CodePath should support codePathToString"));
-    }
-}
-
 /* Implements the "construct on first use" idiom to avoid any static
  * initialization order fiasco.
  *
@@ -120,8 +108,6 @@ void PmeTestEnvironment::SetUp()
     for (int gpuIndex : getCompatibleGpus(hardwareInfo_->gpu_info))
     {
         const DeviceInformation* deviceInfo = getDeviceInfo(hardwareInfo_->gpu_info, gpuIndex);
-        GMX_RELEASE_ASSERT(deviceInfo != nullptr,
-                           "Device information should be provided for the GPU builds.");
         init_gpu(deviceInfo);
 
         char stmp[200] = {};
@@ -132,5 +118,10 @@ void PmeTestEnvironment::SetUp()
     }
 }
 
+void PmeTestEnvironment::TearDown()
+{
+    hardwareContexts_.clear(); // drops all hardware contexts held by the test environment
+}
+
 } // namespace test
 } // namespace gmx
index 0af2343795683cd88580bb2955b71a5bb2eebd0b..6a1450fe79b810074ec4268222e540bcb5ea3cf3 100644 (file)
 #include <gtest/gtest.h>
 
 #include "gromacs/ewald/pme_gpu_program.h"
-#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/hardware/gpu_hw_info.h"
 #include "gromacs/utility/gmxassert.h"
 
+#include "testhardwarecontext.h"
+
 struct gmx_hw_info_t;
 
 namespace gmx
 {
 namespace test
 {
-//! Hardware code path being tested
-enum class CodePath
-{
-    CPU,
-    GPU
-};
-
-//! Return a string useful for human-readable messages describing a \c codePath.
-const char* codePathToString(CodePath codePath);
-
-/*! \internal \brief
- * A structure to describe a hardware context  that persists over the lifetime
- * of the test binary - an abstraction over PmeGpuProgram with a human-readable string.
- */
-struct TestHardwareContext
-{
-    //! Hardware path for the code being tested.
-    CodePath codePath_;
-    //! Readable description
-    std::string description_;
-    //! Device information pointer
-    const DeviceInformation* deviceInfo_;
-    //! Local copy of the device context pointer
-    std::unique_ptr<DeviceContext> deviceContext_;
-    //! Persistent compiled GPU kernels for PME.
-    PmeGpuProgramStorage program_;
-
-public:
-    //! Retuns the code path for this context.
-    CodePath getCodePath() const { return codePath_; }
-    //! Returns a human-readable context description line
-    std::string getDescription() const { return description_; }
-    //! Getter for the DeviceContext
-    const DeviceContext& deviceContext() const
-    {
-        GMX_RELEASE_ASSERT(deviceContext_ != nullptr,
-                           "Trying to get device context before it was initialized or in builds "
-                           "without GPU support.");
-        return *deviceContext_;
-    }
-    //! Returns the device info pointer
-    const DeviceInformation* getDeviceInfo() const { return deviceInfo_; }
-    //! Returns the persistent PME GPU kernels
-    const PmeGpuProgram* getPmeGpuProgram() const { return program_.get(); }
-    //! Constructs the context for CPU builds
-    TestHardwareContext(CodePath codePath, const char* description) :
-        codePath_(codePath),
-        description_(description)
-    {
-        GMX_RELEASE_ASSERT(codePath == CodePath::CPU,
-                           "A GPU code path should provide DeviceInformation to the "
-                           "TestHardwareContext constructor.");
-    }
-    //! Constructs the context for GPU builds
-    TestHardwareContext(CodePath codePath, const char* description, const DeviceInformation& deviceInfo) :
-        codePath_(codePath),
-        description_(description),
-        deviceInfo_(&deviceInfo)
-    {
-        GMX_RELEASE_ASSERT(codePath == CodePath::GPU,
-                           "TestHardwareContext tries to construct DeviceContext and PmeGpuProgram "
-                           "in CPU build.");
-        deviceContext_ = std::make_unique<DeviceContext>(deviceInfo);
-        program_       = buildPmeGpuProgram(*deviceContext_);
-    }
-    ~TestHardwareContext();
-};
 
 //! A container of handles to hardware contexts
 typedef std::vector<std::unique_ptr<TestHardwareContext>> TestHardwareContexts;
@@ -144,6 +78,8 @@ private:
 public:
     //! This is called by GTest framework once to query the hardware
     void SetUp() override;
+    //! This is called by GTest framework once to release the hardware
+    void TearDown() override;
     //! Get available hardware contexts.
     const TestHardwareContexts& getHardwareContexts() const { return hardwareContexts_; }
     //! Get available hardware information.
index 7f05de6b321999f905dcf7a3afda9174dc7a3eec..bfbe049235fdd571bf844c689047a990770532f7 100644 (file)
@@ -59,4 +59,4 @@ bool DeviceStream::isValid() const
     return false;
 }
 
-void DeviceStream::synchronize() const {}
+void DeviceStream::synchronize() const {} // Empty body: this implementation does nothing.
index 1c8228e5902413616790adc6df20cbe4b0f5abee..8c7457a3d3b0da8c8b44b5f4ddef87632a14f162 100644 (file)
@@ -156,6 +156,24 @@ const DeviceStream& DeviceStreamManager::stream(DeviceStreamType streamToGet) co
     return impl_->streams_[streamToGet];
 }
 
+const DeviceStream& DeviceStreamManager::bondedStream(bool hasPPDomainDecomposition) const
+{
+    // Bonded forces reuse a non-bonded stream: the non-local one when there is
+    // a PP domain decomposition, the local one otherwise.
+    if (!hasPPDomainDecomposition)
+    {
+        GMX_RELEASE_ASSERT(stream(DeviceStreamType::NonBondedLocal).isValid(),
+                           "GPU non-bonded local stream should be valid in order to use GPU "
+                           "version of bonded forces without domain decomposition.");
+        return stream(DeviceStreamType::NonBondedLocal);
+    }
+
+    GMX_RELEASE_ASSERT(stream(DeviceStreamType::NonBondedNonLocal).isValid(),
+                       "GPU non-bonded non-local stream should be valid in order to use GPU "
+                       "version of bonded forces with domain decomposition.");
+    return stream(DeviceStreamType::NonBondedNonLocal);
+}
+
 bool DeviceStreamManager::streamIsValid(DeviceStreamType streamToCheck) const
 {
     return impl_->streams_[streamToCheck].isValid();
index 4cfa6161a429c46d1426c6dd65338cedc931904b..4565d1ac0c2044633420536cd4aecfffea5e07d6 100644 (file)
@@ -124,6 +124,12 @@ public:
      */
     const DeviceStream& stream(DeviceStreamType streamToGet) const;
 
+    /*! \brief Returns a handle to the GPU stream to compute bonded forces in.
+     *
+     * \param[in] hasPPDomainDecomposition Whether there is a particle-particle domain decomposition.
+     */
+    const DeviceStream& bondedStream(bool hasPPDomainDecomposition) const;
+
     /*! \brief Return whether the requested GPU stream is valid for use.
      *
      * \param[in] streamToCheck Which stream to check.
index 1c0330e02a1be2749f176f8c383b79d0990b0fdc..e75c80bc079970e28597aa038e0fe18f775c9800 100644 (file)
@@ -171,7 +171,6 @@ TEST_F(DeviceStreamManagerTest, CorrectStreamsAreReturnedOnNonbondedDevice)
             expectValidStreams(&manager, { DeviceStreamType::Pme, DeviceStreamType::NonBondedLocal,
                                            DeviceStreamType::NonBondedNonLocal, DeviceStreamType::PmePpTransfer,
                                            DeviceStreamType::UpdateAndConstraints });
-            expectInvalidStreams(&manager, {});
         }
 
         {
index 0d5367f698f54e46a8e13a112dc852130f1fa2a2..faa775a0f7d9cd62529b9af9a92dffc480bcc4a1 100644 (file)
@@ -51,6 +51,7 @@
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/typecasts.cuh"
 #include "gromacs/mdtypes/enerdata.h"
@@ -71,6 +72,9 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
     deviceContext_(deviceContext),
     deviceStream_(deviceStream)
 {
+    GMX_RELEASE_ASSERT(deviceStream.isValid(),
+                       "Can't run GPU version of bonded forces in stream that is not valid.");
+
     wcycle_ = wcycle;
 
     allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
@@ -81,7 +85,7 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams,
                        deviceStream_, GpuApiCallBehavior::Sync, nullptr);
     vTot_.resize(F_NRE);
     allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
-    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream);
+    clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
 
     kernelParams_.d_forceParams = d_forceParams_;
     kernelParams_.d_xq          = d_xq_;
index 562c1be500972fa717e733dfe24d1b7d6a271f9d..76899bbd82c21ed484d8f7f4977b524d62f90295 100644 (file)
@@ -58,6 +58,7 @@
 
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
@@ -121,8 +122,8 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fRead
     // Make sure that the forces are ready on device before proceeding with the update.
     fReadyOnDevice->enqueueWaitEvent(deviceStream_);
 
-    // The integrate should save a copy of the current coordinates in d_xp_ and write updated once
-    // into d_x_. The d_xp_ is only needed by constraints.
+    // The integrate should save a copy of the current coordinates in d_xp_ and write the
+    // updated ones into d_x_. The d_xp_ is only needed by constraints.
     integrator_->integrate(d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat,
                            doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
     // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
index 9d341e1eaeb646bd71d72d14b0c0d043013596cc..f7f39bbee79329875fdac34f78629b438f99841f 100644 (file)
@@ -67,6 +67,7 @@
 #include "gromacs/fileio/trxio.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/imd/imd.h"
 #include "gromacs/listed_forces/manage_threading.h"
@@ -353,6 +354,7 @@ void gmx::LegacySimulator::do_md()
 
     StatePropagatorDataGpu* stateGpu = fr->stateGpu;
 
+    // TODO: the assertions below should be handled by UpdateConstraintsBuilder.
     if (useGpuForUpdate)
     {
         GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
@@ -397,14 +399,17 @@ void gmx::LegacySimulator::do_md()
         {
             GMX_LOG(mdlog.info).asParagraph().appendText("Updating coordinates on the GPU.");
         }
-
-        GMX_RELEASE_ASSERT(fr->deviceContext != nullptr,
-                           "GPU device context should be initialized to use GPU update.");
-        GMX_RELEASE_ASSERT(stateGpu->getUpdateStream() != nullptr,
-                           "Update stream can not be nullptr when update is on a GPU.");
-        integrator = std::make_unique<UpdateConstrainGpu>(*ir, *top_global, *fr->deviceContext,
-                                                          *stateGpu->getUpdateStream(),
-                                                          stateGpu->xUpdatedOnDevice());
+        GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
+                           "Device stream manager should be initialized in order to use GPU "
+                           "update-constraints.");
+        GMX_RELEASE_ASSERT(
+                fr->deviceStreamManager->streamIsValid(gmx::DeviceStreamType::UpdateAndConstraints),
+                "Update stream should be initialized in order to use GPU "
+                "update-constraints.");
+        integrator = std::make_unique<UpdateConstrainGpu>(
+                *ir, *top_global, fr->deviceStreamManager->context(),
+                fr->deviceStreamManager->stream(gmx::DeviceStreamType::UpdateAndConstraints),
+                stateGpu->xUpdatedOnDevice());
 
         integrator->setPbc(PbcType::Xyz, state->box);
     }
@@ -864,21 +869,11 @@ void gmx::LegacySimulator::do_md()
                 if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
                     && useGpuForNonbonded && is1D(*cr->dd))
                 {
+                    GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
+                                       "GPU device manager has to be initialized to use GPU "
+                                       "version of halo exchange.");
                     // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
-                    const DeviceStream* localStream =
-                            Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
-                    const DeviceStream* nonLocalStream = Nbnxm::gpu_get_command_stream(
-                            fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
-                    GMX_RELEASE_ASSERT(
-                            fr->deviceContext != nullptr,
-                            "GPU device context should be initialized to use GPU halo exchange.");
-                    GMX_RELEASE_ASSERT(localStream != nullptr,
-                                       "Local non-bonded stream can't be nullptr when using GPU "
-                                       "halo exchange.");
-                    GMX_RELEASE_ASSERT(nonLocalStream != nullptr,
-                                       "Non-local non-bonded stream can't be nullptr when using "
-                                       "GPU halo exchange.");
-                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceContext, *localStream, *nonLocalStream);
+                    constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager);
                 }
             }
         }
index 604ff0ed7bf7c2ed50829d0c830c166358f23359..43ac7d6164a7ddf3aef5cb1d03d089103fc4c851 100644 (file)
@@ -74,6 +74,7 @@
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
 #include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/hardware/cpuinfo.h"
 #include "gromacs/hardware/detecthardware.h"
@@ -1141,19 +1142,23 @@ int Mdrunner::mdrunner()
             EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
 
     // Get the device handles for the modules, nullptr when no task is assigned.
-    int                            deviceId      = -1;
-    DeviceInformation*             deviceInfo    = gpuTaskAssignments.initDevice(&deviceId);
-    std::unique_ptr<DeviceContext> deviceContext = nullptr;
-    if (deviceInfo != nullptr)
+    int                deviceId   = -1;
+    DeviceInformation* deviceInfo = gpuTaskAssignments.initDevice(&deviceId);
+
+    // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?)
+    bool useTiming = true;
+    if (GMX_GPU == GMX_GPU_CUDA)
     {
-        if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
-        {
-            dd_setup_dlb_resource_sharing(cr, deviceId);
-        }
-        deviceContext = std::make_unique<DeviceContext>(*deviceInfo);
+        /* WARNING: CUDA timings are incorrect with multiple streams.
+         *          This is the main reason why they are disabled by default.
+         */
+        // TODO: Consider turning on by default when we can detect nr of streams.
+        useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+    }
+    else if (GMX_GPU == GMX_GPU_OPENCL)
+    {
+        useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
     }
-
-    // TODO Initialize GPU streams here.
 
     // TODO Currently this is always built, yet DD partition code
     // checks if it is built before using it. Probably it should
@@ -1190,6 +1195,19 @@ int Mdrunner::mdrunner()
     const bool printHostName = (cr->nnodes > 1);
     gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
 
+    std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
+
+    if (deviceInfo != nullptr)
+    {
+        if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
+        {
+            dd_setup_dlb_resource_sharing(cr, deviceId);
+        }
+        deviceStreamManager = std::make_unique<DeviceStreamManager>(
+                *deviceInfo, useGpuForPme, useGpuForNonbonded, havePPDomainDecomposition(cr),
+                useGpuForUpdate, useTiming);
+    }
+
     // If the user chose a task assignment, give them some hints
     // where appropriate.
     if (!userGpuTaskAssignment.empty())
@@ -1348,32 +1366,36 @@ int Mdrunner::mdrunner()
                       opt2fn("-tablep", filenames.size(), filenames.data()),
                       opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
 
-        fr->deviceContext = deviceContext.get();
+        // Save a handle to device stream manager to use elsewhere in the code
+        // TODO: Forcerec is not a correct place to store it.
+        fr->deviceStreamManager = deviceStreamManager.get();
 
         if (devFlags.enableGpuPmePPComm && !thisRankHasDuty(cr, DUTY_PME))
         {
             GMX_RELEASE_ASSERT(
-                    deviceContext != nullptr,
-                    "Device context can not be nullptr when PME-PP direct communications object.");
+                    deviceStreamManager != nullptr,
+                    "GPU device stream manager should be valid in order to use PME-PP direct "
+                    "communications.");
+            GMX_RELEASE_ASSERT(
+                    deviceStreamManager->streamIsValid(DeviceStreamType::PmePpTransfer),
+                    "GPU PP-PME stream should be valid in order to use GPU PME-PP direct "
+                    "communications.");
             fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
-                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, *deviceContext);
+                    cr->mpi_comm_mysim, cr->dd->pme_nodeid, deviceStreamManager->context(),
+                    deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
         }
 
-        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, deviceInfo,
-                                        fr->deviceContext, &mtop, box, wcycle);
+        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, useGpuForNonbonded,
+                                        deviceStreamManager.get(), &mtop, box, wcycle);
+        // TODO: Move the logic below to a GPU bonded builder
         if (useGpuForBonded)
         {
-            auto stream = havePPDomainDecomposition(cr)
-                                  ? Nbnxm::gpu_get_command_stream(
-                                            fr->nbv->gpu_nbv, gmx::InteractionLocality::NonLocal)
-                                  : Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv,
-                                                                  gmx::InteractionLocality::Local);
-            GMX_RELEASE_ASSERT(
-                    fr->deviceContext != nullptr,
-                    "Device context can not be nullptr when computing bonded interactions on GPU.");
-            GMX_RELEASE_ASSERT(stream != nullptr,
-                               "Can'r run GPU version of bonded forces in nullptr stream.");
-            gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, *fr->deviceContext, *stream, wcycle);
+            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+                               "GPU device stream manager should be valid in order to use GPU "
+                               "version of bonded forces.");
+            gpuBonded = std::make_unique<GpuBonded>(
+                    mtop.ffparams, deviceStreamManager->context(),
+                    deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)), wcycle);
             fr->gpuBonded = gpuBonded.get();
         }
 
@@ -1450,9 +1472,11 @@ int Mdrunner::mdrunner()
     if (thisRankHasPmeGpuTask)
     {
         GMX_RELEASE_ASSERT(
-                deviceContext != nullptr,
-                "Device context can not be nullptr when building PME GPU program object.");
-        pmeGpuProgram = buildPmeGpuProgram(*deviceContext);
+                (deviceStreamManager != nullptr),
+                "GPU device stream manager should be initialized in order to use GPU for PME.");
+        GMX_RELEASE_ASSERT((deviceInfo != nullptr),
+                           "GPU device should be initialized in order to use GPU for PME.");
+        pmeGpuProgram = buildPmeGpuProgram(deviceStreamManager->context());
     }
 
     /* Initiate PME if necessary,
@@ -1478,10 +1502,23 @@ int Mdrunner::mdrunner()
         {
             try
             {
+                // TODO: This should be in the builder.
+                GMX_RELEASE_ASSERT(!useGpuForPme || (deviceStreamManager != nullptr),
+                                   "Device stream manager should be valid in order to use GPU "
+                                   "version of PME.");
+                GMX_RELEASE_ASSERT(
+                        !useGpuForPme || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
+                        "GPU PME stream should be valid in order to use GPU version of PME.");
+
+                const DeviceContext* deviceContext =
+                        useGpuForPme ? &deviceStreamManager->context() : nullptr;
+                const DeviceStream* pmeStream =
+                        useGpuForPme ? &deviceStreamManager->stream(DeviceStreamType::Pme) : nullptr;
+
                 pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec, nChargePerturbed != 0,
                                        nTypePerturbed != 0, mdrunOptions.reproducible, ewaldcoeff_q,
                                        ewaldcoeff_lj, gmx_omp_nthreads_get(emntPME), pmeRunMode,
-                                       nullptr, deviceInfo, pmeGpuProgram.get(), mdlog);
+                                       nullptr, deviceContext, pmeStream, pmeGpuProgram.get(), mdlog);
             }
             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
         }
@@ -1581,24 +1618,13 @@ int Mdrunner::mdrunner()
             && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
                 || runScheduleWork.simulationWork.useGpuBufferOps))
         {
-            const DeviceStream* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
-            const DeviceStream* localStream =
-                    fr->nbv->gpu_nbv != nullptr
-                            ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
-                            : nullptr;
-            const DeviceStream* nonLocalStream =
-                    fr->nbv->gpu_nbv != nullptr
-                            ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
-                            : nullptr;
             GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
                                                       ? GpuApiCallBehavior::Async
                                                       : GpuApiCallBehavior::Sync;
-            GMX_RELEASE_ASSERT(
-                    deviceContext != nullptr,
-                    "Device context can not be nullptr when building GPU propagator data object.");
+            GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+                               "GPU device stream manager should be initialized to use GPU.");
             stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                    pmeStream, localStream, nonLocalStream, *deviceContext, transferKind,
-                    pme_gpu_get_block_size(fr->pmedata), wcycle);
+                    *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
             fr->stateGpu = stateGpu.get();
         }
 
@@ -1634,7 +1660,7 @@ int Mdrunner::mdrunner()
         /* do PME only */
         walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
         gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode,
-                    deviceContext.get());
+                    deviceStreamManager.get());
     }
 
     wallcycle_stop(wcycle, ewcRUN);
@@ -1648,6 +1674,7 @@ int Mdrunner::mdrunner()
     // clean up cycle counter
     wallcycle_destroy(wcycle);
 
+    deviceStreamManager.reset(nullptr);
     // Free PME data
     if (pmedata)
     {
@@ -1695,7 +1722,6 @@ int Mdrunner::mdrunner()
     }
 
     free_gpu(deviceInfo);
-    deviceContext.reset(nullptr);
     sfree(fcd);
 
     if (doMembed)
@@ -1732,7 +1758,7 @@ int Mdrunner::mdrunner()
     }
 #endif
     return rc;
-}
+} // Mdrunner::mdrunner()
 
 Mdrunner::~Mdrunner()
 {
index f518692e7d037bf52f00c5cd36741ff8226c8fab..169cb1c15348833fcb1159768fafd71a3adf0d49 100644 (file)
@@ -59,6 +59,7 @@ struct t_QMMMrec;
 
 namespace gmx
 {
+class DeviceStreamManager;
 class GpuBonded;
 class ForceProviders;
 class StatePropagatorDataGpu;
@@ -284,6 +285,8 @@ struct t_forcerec
     // TODO: This is not supposed to be here. StatePropagatorDataGpu should be a part of
     //       general StatePropagatorData object that is passed around
     gmx::StatePropagatorDataGpu* stateGpu = nullptr;
+    // TODO: Should not be here. This is here only to pass the pointer around.
+    gmx::DeviceStreamManager* deviceStreamManager = nullptr;
 
     //! GPU device context
     DeviceContext* deviceContext = nullptr;
index a4f77cbf164d2fcd36f5d0ae25c934bfb2955f6f..bcc6dc5c384bbad413b65489b53301e3b9d4f58b 100644 (file)
@@ -66,6 +66,7 @@ struct gmx_wallcycle;
 
 namespace gmx
 {
+class DeviceStreamManager;
 
 class StatePropagatorDataGpu
 {
@@ -86,35 +87,15 @@ public:
      * ops are offloaded. This feature is currently not available in OpenCL and
      * hence these streams are not set in these builds.
      *
-     * \note In CUDA, the update stream is created in the constructor as a temporary
-     *       solution, in place until the stream manager is introduced.
-     *       Note that this makes it impossible to construct this object in CUDA
-     *       builds executing on a host without any CUDA-capable device available.
-     *
-     * \note In CUDA, \p deviceContext is unused, hence always nullptr;
-     *       all stream arguments can also be nullptr in runs where the
-     *       respective streams are not required.
-     *       In OpenCL, \p deviceContext needs to be a valid device context.
-     *       In OpenCL runs StatePropagatorDataGpu is currently only used
-     *       with PME offload, and only on ranks with PME duty. Hence, the
-     *       \p pmeStream argument needs to be a valid OpenCL queue object
-     *       which must have been created in \p deviceContext.
-     *
-     *  \param[in] pmeStream       Device PME stream, nullptr allowed.
-     *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
-     *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
-     *  \param[in] deviceContext   Device context, nullptr allowed.
-     *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
+     *  \param[in] deviceStreamManager         Object that owns the DeviceContext and DeviceStreams.
+     *  \param[in] transferKind                H2D/D2H transfer call behavior (synchronous or not).
      *  \param[in] allocationBlockSizeDivisor  Deterines padding size for coordinates buffer.
-     *  \param[in] wcycle          Wall cycle counter data.
+     *  \param[in] wcycle                      Wall cycle counter data.
      */
-    StatePropagatorDataGpu(const DeviceStream*  pmeStream,
-                           const DeviceStream*  localStream,
-                           const DeviceStream*  nonLocalStream,
-                           const DeviceContext& deviceContext,
-                           GpuApiCallBehavior   transferKind,
-                           int                  allocationBlockSizeDivisor,
-                           gmx_wallcycle*       wcycle);
+    StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+                           GpuApiCallBehavior         transferKind,
+                           int                        allocationBlockSizeDivisor,
+                           gmx_wallcycle*             wcycle);
 
     /*! \brief Constructor to use in PME-only rank and in tests.
      *
index 68c884f99b6e448adba3205219ac68b20e1f978a..269b6eb3778650061ccbf9c82789e9da4b357350 100644 (file)
@@ -54,10 +54,7 @@ class StatePropagatorDataGpu::Impl
 {
 };
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream* /* pmeStream       */,
-                                               const DeviceStream* /* localStream     */,
-                                               const DeviceStream* /* nonLocalStream  */,
-                                               const DeviceContext& /* deviceContext   */,
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& /* deviceStreamManager */,
                                                GpuApiCallBehavior /* transferKind    */,
                                                int /* allocationBlockSizeDivisor */,
                                                gmx_wallcycle* /*   wcycle */) :
index b0576925479832645669ca9d2fd0e7e6e754ab60..fd9ff197addc5c3ee41dd21c7bfb9063802a4150 100644 (file)
@@ -85,35 +85,15 @@ public:
      * ops are offloaded. This feature is currently not available in OpenCL and
      * hence these streams are not set in these builds.
      *
-     * \note In CUDA, the update stream is created in the constructor as a temporary
-     *       solution, in place until the stream manager is introduced.
-     *       Note that this makes it impossible to construct this object in CUDA
-     *       builds executing on a host without any CUDA-capable device available.
-     *
-     * \note In CUDA, \p deviceContext is unused, hence always nullptr;
-     *       all stream arguments can also be nullptr in runs where the
-     *       respective streams are not required.
-     *       In OpenCL, \p deviceContext needs to be a valid device context.
-     *       In OpenCL runs StatePropagatorDataGpu is currently only used
-     *       with PME offload, and only on ranks with PME duty. Hence, the
-     *       \p pmeStream argument needs to be a valid OpenCL queue object
-     *       which must have been created in \p deviceContext.
-     *
-     *  \param[in] pmeStream       Device PME stream, nullptr allowed.
-     *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
-     *  \param[in] nonLocalStream  Device NBNXM non-local stream, nullptr allowed.
-     *  \param[in] deviceContext   Device context, nullptr allowed.
-     *  \param[in] transferKind    H2D/D2H transfer call behavior (synchronous or not).
+     *  \param[in] deviceStreamManager         Object that owns the DeviceContext and DeviceStreams.
+     *  \param[in] transferKind                H2D/D2H transfer call behavior (synchronous or not).
      *  \param[in] allocationBlockSizeDivisor  Determines the padding size for coordinates buffer.
-     *  \param[in] wcycle          Wall cycle counter data.
+     *  \param[in] wcycle                      Wall cycle counter data.
      */
-    Impl(const DeviceStream*  pmeStream,
-         const DeviceStream*  localStream,
-         const DeviceStream*  nonLocalStream,
-         const DeviceContext& deviceContext,
-         GpuApiCallBehavior   transferKind,
-         int                  allocationBlockSizeDivisor,
-         gmx_wallcycle*       wcycle);
+    Impl(const DeviceStreamManager& deviceStreamManager,
+         GpuApiCallBehavior         transferKind,
+         int                        allocationBlockSizeDivisor,
+         gmx_wallcycle*             wcycle);
 
     /*! \brief Constructor to use in PME-only rank and in tests.
      *
@@ -346,9 +326,6 @@ private:
     //! GPU Update-constreaints stream.
     const DeviceStream* updateStream_;
 
-    //! An owning pointer to the update stream, in case we manage its lifetime here. Temporary.
-    DeviceStream updateStreamOwn_;
-
     // Streams to use for coordinates H2D and D2H copies (one event for each atom locality)
     EnumerationArray<AtomLocality, const DeviceStream*> xCopyStreams_ = { { nullptr } };
     // Streams to use for velocities H2D and D2H copies (one event for each atom locality)
index e60e9fa73b7eb7ded130cccc5d27699d6c63da84..bf927f2da297084caae44ef98a499038fabbccc7 100644 (file)
 
 #if GMX_GPU != GMX_GPU_NONE
 
-#    if GMX_GPU == GMX_GPU_CUDA
-#        include "gromacs/gpu_utils/cudautils.cuh"
-#    endif
+#    include "gromacs/gpu_utils/device_stream_manager.h"
 #    include "gromacs/gpu_utils/devicebuffer.h"
 #    include "gromacs/gpu_utils/gputraits.h"
-#    if GMX_GPU == GMX_GPU_OPENCL
-#        include "gromacs/gpu_utils/oclutils.h"
-#    endif
 #    include "gromacs/math/vectypes.h"
 #    include "gromacs/mdtypes/state_propagator_data_gpu.h"
 #    include "gromacs/timing/wallcycle.h"
 namespace gmx
 {
 
-StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
-                                   const DeviceStream*  localStream,
-                                   const DeviceStream*  nonLocalStream,
-                                   const DeviceContext& deviceContext,
-                                   GpuApiCallBehavior   transferKind,
-                                   int                  allocationBlockSizeDivisor,
-                                   gmx_wallcycle*       wcycle) :
-    deviceContext_(deviceContext),
+StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManager,
+                                   GpuApiCallBehavior         transferKind,
+                                   int                        allocationBlockSizeDivisor,
+                                   gmx_wallcycle*             wcycle) :
+    deviceContext_(deviceStreamManager.context()),
     transferKind_(transferKind),
     allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
     wcycle_(wcycle)
 {
-    static_assert(GMX_GPU != GMX_GPU_NONE,
-                  "This object should only be constructed on the GPU code-paths.");
+    static_assert(
+            GMX_GPU != GMX_GPU_NONE,
+            "GPU state propagator data object should only be constructed on the GPU code-paths.");
 
-    // TODO: Refactor when the StreamManager is introduced.
+    // We need to keep local copies for re-initialization.
+    pmeStream_      = &deviceStreamManager.stream(DeviceStreamType::Pme);
+    localStream_    = &deviceStreamManager.stream(DeviceStreamType::NonBondedLocal);
+    nonLocalStream_ = &deviceStreamManager.stream(DeviceStreamType::NonBondedNonLocal);
+    // PME stream is used in OpenCL for H2D coordinate transfer
     if (GMX_GPU == GMX_GPU_OPENCL)
     {
-        GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
-
-        // The update stream is set to the PME stream in OpenCL, since PME stream is the only stream created in the PME context.
-        pmeStream_    = pmeStream;
-        updateStream_ = pmeStream;
-        GMX_UNUSED_VALUE(localStream);
-        GMX_UNUSED_VALUE(nonLocalStream);
+        updateStream_ = &deviceStreamManager.stream(DeviceStreamType::Pme);
     }
-
-    if (GMX_GPU == GMX_GPU_CUDA)
+    else
     {
-        if (pmeStream != nullptr)
-        {
-            pmeStream_ = pmeStream;
-        }
-        if (localStream != nullptr)
-        {
-            localStream_ = localStream;
-        }
-        if (nonLocalStream != nullptr)
-        {
-            nonLocalStream_ = nonLocalStream;
-        }
-
-        // TODO: The update stream should be created only when it is needed.
-#    if (GMX_GPU == GMX_GPU_CUDA)
-        // In CUDA we only need priority to create stream.
-        // (note that this will be moved from here in the follow-up patch)
-        updateStreamOwn_.init(deviceContext, DeviceStreamPriority::Normal, false);
-        updateStream_ = &updateStreamOwn_;
-#    endif
+        updateStream_ = &deviceStreamManager.stream(DeviceStreamType::UpdateAndConstraints);
     }
 
     // Map the atom locality to the stream that will be used for coordinates,
@@ -142,10 +113,11 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
     allocationBlockSizeDivisor_(allocationBlockSizeDivisor),
     wcycle_(wcycle)
 {
-    static_assert(GMX_GPU != GMX_GPU_NONE,
-                  "This object should only be constructed on the GPU code-paths.");
+    static_assert(
+            GMX_GPU != GMX_GPU_NONE,
+            "GPU state propagator data object should only be constructed on the GPU code-paths.");
 
-    GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
+    GMX_ASSERT(pmeStream != nullptr && pmeStream->isValid(), "GPU PME stream should be valid.");
     pmeStream_      = pmeStream;
     localStream_    = pmeStream; // For clearing the force buffer
     nonLocalStream_ = nullptr;
@@ -256,8 +228,7 @@ void StatePropagatorDataGpu::Impl::copyToDevice(DeviceBuffer<RVec>
 
     GMX_ASSERT(dataSize >= 0, "Trying to copy to device buffer before it was allocated.");
 
-    GMX_ASSERT(deviceStream.stream() != nullptr,
-               "No stream is valid for copying with given atom locality.");
+    GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
@@ -291,8 +262,7 @@ void StatePropagatorDataGpu::Impl::copyFromDevice(gmx::ArrayRef<gmx::RVec> h_dat
 
     GMX_ASSERT(dataSize >= 0, "Trying to copy from device buffer before it was allocated.");
 
-    GMX_ASSERT(deviceStream.stream() != nullptr,
-               "No stream is valid for copying with given atom locality.");
+    GMX_ASSERT(deviceStream.isValid(), "No stream is valid for copying with given atom locality.");
     wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
     wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
 
@@ -546,14 +516,11 @@ int StatePropagatorDataGpu::Impl::numAtomsAll()
 }
 
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStream*  pmeStream,
-                                               const DeviceStream*  localStream,
-                                               const DeviceStream*  nonLocalStream,
-                                               const DeviceContext& deviceContext,
-                                               GpuApiCallBehavior   transferKind,
-                                               int                  allocationBlockSizeDivisor,
-                                               gmx_wallcycle*       wcycle) :
-    impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, allocationBlockSizeDivisor, wcycle))
+StatePropagatorDataGpu::StatePropagatorDataGpu(const DeviceStreamManager& deviceStreamManager,
+                                               GpuApiCallBehavior         transferKind,
+                                               int            allocationBlockSizeDivisor,
+                                               gmx_wallcycle* wcycle) :
+    impl_(new Impl(deviceStreamManager, transferKind, allocationBlockSizeDivisor, wcycle))
 {
 }
 
index c015326e8dbee594583eda8bf566090b178c9c24..95aca5ba1d5dec5cf51faca94fa3ba4909791de4 100644 (file)
@@ -366,7 +366,7 @@ static inline int calc_shmem_required_nonbonded(const int               num_thre
  */
 void nbnxnInsertNonlocalGpuDependency(const NbnxmGpu* nb, const InteractionLocality interactionLocality)
 {
-    const DeviceStream& deviceStream = nb->deviceStreams[interactionLocality];
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
 
     /* When we get here all misc operations issued in the local stream as well as
        the local xq H2D are done,
@@ -405,7 +405,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     cu_atomdata_t*      adat         = nb->atdat;
     cu_plist_t*         plist        = nb->plist[iloc];
     cu_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -485,7 +485,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     cu_nbparam_t*       nbp          = nb->nbparam;
     cu_plist_t*         plist        = nb->plist[iloc];
     cu_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -598,7 +598,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     cu_nbparam_t*       nbp          = nb->nbparam;
     cu_plist_t*         plist        = nb->plist[iloc];
     cu_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -732,7 +732,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     cu_atomdata_t*      adat         = nb->atdat;
     cu_timers_t*        t            = nb->timers;
     bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     /* don't launch non-local copy-back if there was no non-local work to do */
     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
@@ -836,7 +836,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid&        grid,
     const int                  numAtomsPerCell = grid.numAtomsPerCell();
     Nbnxm::InteractionLocality interactionLoc  = gpuAtomToInteractionLocality(locality);
 
-    const DeviceStream& deviceStream = nb->deviceStreams[interactionLoc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[interactionLoc];
 
     int numAtoms = grid.srcAtomEnd() - grid.srcAtomBegin();
     // avoid empty kernel launch, skip to inserting stream dependency
@@ -901,7 +901,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality                         atomLo
     GMX_ASSERT(totalForcesDevice, "Need a valid totalForcesDevice pointer");
 
     const InteractionLocality iLocality    = gpuAtomToInteractionLocality(atomLocality);
-    const DeviceStream&       deviceStream = nb->deviceStreams[iLocality];
+    const DeviceStream&       deviceStream = *nb->deviceStreams[iLocality];
     cu_atomdata_t*            adat         = nb->atdat;
 
     size_t gmx_used_in_debug numDependency = static_cast<size_t>((useGpuFPmeReduction == true))
index a76880b17ee1484e0b48c2649b8b6b1ada46749c..6579d4100497e1784d483c4e942920e10f506a0d 100644 (file)
@@ -51,7 +51,7 @@
 
 // TODO Remove this comment when the above order issue is resolved
 #include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/pmalloc_cuda.h"
@@ -413,16 +413,16 @@ static void cuda_init_const(NbnxmGpu*                       nb,
     nbnxn_cuda_clear_e_fshift(nb);
 }
 
-NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
-                   const interaction_const_t* ic,
-                   const PairlistParams&      listParams,
-                   const nbnxn_atomdata_t*    nbat,
-                   bool                       bLocalAndNonlocal)
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+                   const interaction_const_t*      ic,
+                   const PairlistParams&           listParams,
+                   const nbnxn_atomdata_t*         nbat,
+                   bool                            bLocalAndNonlocal)
 {
     cudaError_t stat;
 
     auto nb            = new NbnxmGpu();
-    nb->deviceContext_ = &deviceContext;
+    nb->deviceContext_ = &deviceStreamManager.context();
     snew(nb->atdat, 1);
     snew(nb->nbparam, 1);
     snew(nb->plist[InteractionLocality::Local], 1);
@@ -444,8 +444,10 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
     init_plist(nb->plist[InteractionLocality::Local]);
 
     /* local/non-local GPU streams */
-    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
-                                                       DeviceStreamPriority::Normal, nb->bDoTime);
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+    nb->deviceStreams[InteractionLocality::Local] =
+            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
     if (nb->bUseTwoStreams)
     {
         init_plist(nb->plist[InteractionLocality::NonLocal]);
@@ -454,8 +456,11 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
          * priorities, because we are querying the priority range which in this
          * case will be a single value.
          */
-        nb->deviceStreams[InteractionLocality::NonLocal].init(
-                *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                           "Non-local non-bonded stream should be initialized to use GPU for "
+                           "non-bonded with domain decomposition.");
+        nb->deviceStreams[InteractionLocality::NonLocal] =
+                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
     }
 
     /* init events for sychronization (timing disabled for performance reasons!) */
@@ -504,7 +510,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 {
     char                sbuf[STRLEN];
     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
     cu_plist_t*         d_plist      = nb->plist[iloc];
 
     if (d_plist->na_c < 0)
@@ -561,7 +567,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -576,7 +582,7 @@ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
@@ -587,7 +593,7 @@ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -616,7 +622,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     bool                bDoTime      = nb->bDoTime;
     cu_timers_t*        timers       = nb->timers;
     cu_atomdata_t*      d_atdat      = nb->atdat;
-    const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
 
     natoms    = nbat->numAtoms();
     realloced = false;
@@ -806,13 +812,6 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
     return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
 }
 
-const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
-{
-    assert(nb);
-
-    return &nb->deviceStreams[iloc];
-}
-
 void* gpu_get_xq(NbnxmGpu* nb)
 {
     assert(nb);
@@ -838,7 +837,7 @@ DeviceBuffer<gmx::RVec> gpu_get_fshift(NbnxmGpu* nb)
 /* TODO  Remove explicit pinning from host arrays from here and manage in a more natural way*/
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
 {
-    const DeviceStream& deviceStream  = gpu_nbv->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream  = *gpu_nbv->deviceStreams[InteractionLocality::Local];
     bool                bDoTime       = gpu_nbv->bDoTime;
     const int           maxNumColumns = gridSet.numColumnsMax();
 
@@ -929,7 +928,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
                                     GpuEventSynchronizer* const localReductionDone)
 {
 
-    const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream = *gpu_nbv->deviceStreams[InteractionLocality::Local];
 
     GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
     gpu_nbv->localFReductionDone = localReductionDone;
index de5241a5feec681eaf81e05b850f26a47d28b159..68d5da81c4b7bcad19f6e201dc41de7bd504648a 100644 (file)
@@ -306,7 +306,7 @@ struct NbnxmGpu
     /*! \brief staging area where fshift/energies get downloaded */
     nb_staging_t nbst;
     /*! \brief local and non-local GPU streams */
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
 
     /*! \brief Events used for synchronization */
     /*! \{ */
index f5b3d813da09e7043e864a497d17994730aaa2cb..01f257e54f08fd787bdba775ceeeced859ef5ba1 100644 (file)
@@ -399,7 +399,7 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
             // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
             wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
 
-            if (!haveStreamTasksCompleted(nb->deviceStreams[iLocality]))
+            if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
             {
                 wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
 
@@ -412,7 +412,7 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
         }
         else if (haveResultToWaitFor)
         {
-            nb->deviceStreams[iLocality].synchronize();
+            nb->deviceStreams[iLocality]->synchronize();
         }
 
         // TODO: this needs to be moved later because conditional wait could brake timing
index 8e114d1c65cb9affd4c5c9ee77f60dfd21c03168..21fc8174b56d494617b6e47ba5251563db467b05 100644 (file)
@@ -50,9 +50,6 @@
 #include "gromacs/gpu_utils/gpu_macros.h"
 #include "gromacs/mdtypes/locality.h"
 
-class DeviceContext;
-class DeviceStream;
-
 struct NbnxmGpu;
 struct gmx_gpu_info_t;
 struct DeviceInformation;
@@ -62,12 +59,19 @@ struct NbnxnPairlistGpu;
 struct PairlistParams;
 struct interaction_const_t;
 
+class DeviceStream;
+
+namespace gmx
+{
+class DeviceStreamManager;
+}
+
 namespace Nbnxm
 {
 
 /** Initializes the data structures related to GPU nonbonded calculations. */
 GPU_FUNC_QUALIFIER
-NbnxmGpu* gpu_init(const DeviceContext gmx_unused& deviceContext,
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager gmx_unused& deviceStreamManager,
                    const interaction_const_t gmx_unused* ic,
                    const PairlistParams gmx_unused& listParams,
                    const nbnxn_atomdata_t gmx_unused* nbat,
index a15f646ed98eb38925e8c4671397701965d63700..d5ced753f7cc2b2f2d8dac8d3e4508a486b39d28 100644 (file)
 #include "gromacs/utility/enumerationhelpers.h"
 #include "gromacs/utility/real.h"
 
-class DeviceContext;
 struct DeviceInformation;
 struct gmx_domdec_zones_t;
 struct gmx_enerdata_t;
@@ -144,6 +143,7 @@ class GpuEventSynchronizer;
 
 namespace gmx
 {
+class DeviceStreamManager;
 class ForceWithShiftForces;
 class GpuBonded;
 template<typename>
@@ -404,16 +404,16 @@ namespace Nbnxm
 {
 
 /*! \brief Creates an Nbnxm object */
-std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlog,
-                                                   const t_inputrec*        ir,
-                                                   const t_forcerec*        fr,
-                                                   const t_commrec*         cr,
-                                                   const gmx_hw_info_t&     hardwareInfo,
-                                                   const DeviceInformation* deviceInfo,
-                                                   const DeviceContext*     deviceContext,
-                                                   const gmx_mtop_t*        mtop,
-                                                   matrix                   box,
-                                                   gmx_wallcycle*           wcycle);
+std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
+                                                   const t_inputrec*    ir,
+                                                   const t_forcerec*    fr,
+                                                   const t_commrec*     cr,
+                                                   const gmx_hw_info_t& hardwareInfo,
+                                                   bool                 useGpuForNonbonded,
+                                                   const gmx::DeviceStreamManager* deviceStreamManager,
+                                                   const gmx_mtop_t*               mtop,
+                                                   matrix                          box,
+                                                   gmx_wallcycle*                  wcycle);
 
 } // namespace Nbnxm
 
index d65c59c91d9f07ec1fa08d8271f7c385ce366df5..ea90c12b0c7e713c16777d492c0a0409a554a0e3 100644 (file)
@@ -358,25 +358,24 @@ static int getMinimumIlistCountForGpuBalancing(NbnxmGpu* nbnxmGpu)
     return minimumIlistCount;
 }
 
-std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlog,
-                                                   const t_inputrec*        ir,
-                                                   const t_forcerec*        fr,
-                                                   const t_commrec*         cr,
-                                                   const gmx_hw_info_t&     hardwareInfo,
-                                                   const DeviceInformation* deviceInfo,
-                                                   const DeviceContext*     deviceContext,
-                                                   const gmx_mtop_t*        mtop,
-                                                   matrix                   box,
-                                                   gmx_wallcycle*           wcycle)
+std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger& mdlog,
+                                                   const t_inputrec*    ir,
+                                                   const t_forcerec*    fr,
+                                                   const t_commrec*     cr,
+                                                   const gmx_hw_info_t& hardwareInfo,
+                                                   const bool           useGpuForNonbonded,
+                                                   const gmx::DeviceStreamManager* deviceStreamManager,
+                                                   const gmx_mtop_t*               mtop,
+                                                   matrix                          box,
+                                                   gmx_wallcycle*                  wcycle)
 {
     const bool emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
-    const bool useGpu     = deviceInfo != nullptr;
 
-    GMX_RELEASE_ASSERT(!(emulateGpu && useGpu),
+    GMX_RELEASE_ASSERT(!(emulateGpu && useGpuForNonbonded),
                        "When GPU emulation is active, there cannot be a GPU assignment");
 
     NonbondedResource nonbondedResource;
-    if (useGpu)
+    if (useGpuForNonbonded)
     {
         nonbondedResource = NonbondedResource::Gpu;
     }
@@ -425,7 +424,8 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
         enbnxninitcombrule = enbnxninitcombruleNONE;
     }
 
-    auto pinPolicy = (useGpu ? gmx::PinningPolicy::PinnedIfSupported : gmx::PinningPolicy::CannotBePinned);
+    auto pinPolicy = (useGpuForNonbonded ? gmx::PinningPolicy::PinnedIfSupported
+                                         : gmx::PinningPolicy::CannotBePinned);
 
     auto nbat = std::make_unique<nbnxn_atomdata_t>(pinPolicy);
 
@@ -440,18 +440,18 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
     }
     nbnxn_atomdata_init(mdlog, nbat.get(), kernelSetup.kernelType, enbnxninitcombrule, fr->ntype,
                         fr->nbfp, mimimumNumEnergyGroupNonbonded,
-                        (useGpu || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
+                        (useGpuForNonbonded || emulateGpu) ? 1 : gmx_omp_nthreads_get(emntNonbonded));
 
     NbnxmGpu* gpu_nbv                          = nullptr;
     int       minimumIlistCountForGpuBalancing = 0;
-    if (useGpu)
+    if (useGpuForNonbonded)
     {
-        GMX_RELEASE_ASSERT(
-                deviceContext != nullptr,
-                "Device context can not be nullptr when to use GPU for non-bonded forces.");
         /* init the NxN GPU data; the last argument tells whether we'll have
          * both local and non-local NB calculation on GPU */
-        gpu_nbv = gpu_init(*deviceContext, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
+        GMX_RELEASE_ASSERT(
+                (deviceStreamManager != nullptr),
+                "Device stream manager should be initialized in order to use GPU for non-bonded.");
+        gpu_nbv = gpu_init(*deviceStreamManager, fr->ic, pairlistParams, nbat.get(), haveMultipleDomains);
 
         minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv);
     }
index ca6d9e4b197c80a515665cc2fffb171c9a69f9c2..a35a188400028bc6cedba05ea6044971cd57f947 100644 (file)
@@ -487,7 +487,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
     cl_atomdata_t*      adat         = nb->atdat;
     cl_plist_t*         plist        = nb->plist[iloc];
     cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -587,7 +587,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
     cl_nbparam_t*       nbp          = nb->nbparam;
     cl_plist_t*         plist        = nb->plist[iloc];
     cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
 
@@ -725,7 +725,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     cl_nbparam_t*       nbp          = nb->nbparam;
     cl_plist_t*         plist        = nb->plist[iloc];
     cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
     bool                bDoTime      = nb->bDoTime;
 
     if (plist->haveFreshList)
@@ -862,7 +862,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     cl_atomdata_t*      adat         = nb->atdat;
     cl_timers_t*        t            = nb->timers;
     bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     /* don't launch non-local copy-back if there was no non-local work to do */
     if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
index f11aa2d807b357d9ca2b9ef56e9d764fa8b56421..bc913e0e24d6f2e45bf3e8b89c791f4aae1d7642 100644 (file)
@@ -52,6 +52,7 @@
 
 #include <cmath>
 
+#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/hardware/gpu_hw_info.h"
@@ -485,7 +486,7 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb)
 
     cl_int           cl_error;
     cl_atomdata_t*   adat = nb->atdat;
-    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     size_t local_work_size[3]  = { 1, 1, 1 };
     size_t global_work_size[3] = { 1, 1, 1 };
@@ -555,16 +556,16 @@ static void nbnxn_ocl_init_const(cl_atomdata_t*                  atomData,
 
 
 //! This function is documented in the header file
-NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
-                   const interaction_const_t* ic,
-                   const PairlistParams&      listParams,
-                   const nbnxn_atomdata_t*    nbat,
-                   const bool                 bLocalAndNonlocal)
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+                   const interaction_const_t*      ic,
+                   const PairlistParams&           listParams,
+                   const nbnxn_atomdata_t*         nbat,
+                   const bool                      bLocalAndNonlocal)
 {
     GMX_ASSERT(ic, "Need a valid interaction constants object");
 
     auto nb            = new NbnxmGpu();
-    nb->deviceContext_ = &deviceContext;
+    nb->deviceContext_ = &deviceStreamManager.context();
     snew(nb->atdat, 1);
     snew(nb->nbparam, 1);
     snew(nb->plist[InteractionLocality::Local], 1);
@@ -578,6 +579,7 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
     nb->timers = new cl_timers_t();
     snew(nb->timings, 1);
 
+    /* allocate the runtime data object for the device */
     nb->dev_rundata = new gmx_device_runtime_data_t();
 
     /* init nbst */
@@ -591,15 +593,20 @@ NbnxmGpu* gpu_init(const DeviceContext&       deviceContext,
     nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
 
     /* local/non-local GPU streams */
-    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceContext_,
-                                                       DeviceStreamPriority::Normal, nb->bDoTime);
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+    nb->deviceStreams[InteractionLocality::Local] =
+            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
 
     if (nb->bUseTwoStreams)
     {
         init_plist(nb->plist[InteractionLocality::NonLocal]);
 
-        nb->deviceStreams[InteractionLocality::NonLocal].init(
-                *nb->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
+        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                           "Non-local non-bonded stream should be initialized to use GPU for "
+                           "non-bonded with domain decomposition.");
+        nb->deviceStreams[InteractionLocality::NonLocal] =
+                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
     }
 
     if (nb->bDoTime)
@@ -647,7 +654,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear)
     cl_int gmx_used_in_debug cl_error;
 
     cl_atomdata_t*   atomData = nb->atdat;
-    cl_command_queue ls       = nb->deviceStreams[InteractionLocality::Local].stream();
+    cl_command_queue ls       = nb->deviceStreams[InteractionLocality::Local]->stream();
     cl_float         value    = 0.0F;
 
     cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0,
@@ -669,7 +676,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
 
     /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
     cl_int gmx_unused cl_error;
-    cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local].stream());
+    cl_error = clFlush(nb->deviceStreams[InteractionLocality::Local]->stream());
     GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
 }
 
@@ -681,7 +688,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
     // because getLastRangeTime() gets skipped with empty lists later
     // which leads to the counter not being reset.
     bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
-    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
     cl_plist_t*         d_plist      = nb->plist[iloc];
 
     if (d_plist->na_c < 0)
@@ -740,7 +747,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
     cl_atomdata_t*   adat = nb->atdat;
-    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local].stream();
+    cl_command_queue ls   = nb->deviceStreams[InteractionLocality::Local]->stream();
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -760,7 +767,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     bool                bDoTime      = nb->bDoTime;
     cl_timers_t*        timers       = nb->timers;
     cl_atomdata_t*      d_atdat      = nb->atdat;
-    const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
+    const DeviceStream& deviceStream = *nb->deviceStreams[InteractionLocality::Local];
 
     natoms    = nbat->numAtoms();
     realloced = false;
index 67029075239f5dc9e47535d0ef8c57961e3eb6dd..cda229478339145739f29b38b61b709e74a1f50f 100644 (file)
@@ -364,7 +364,7 @@ struct NbnxmGpu
     nb_staging_t nbst;
 
     //! local and non-local GPU queues
-    gmx::EnumerationArray<Nbnxm::InteractionLocality, DeviceStream> deviceStreams;
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
 
     /*! \brief Events used for synchronization */
     /*! \{ */