From e3d904d007afc9545f95d72da51c7dc8d83dae54 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Thu, 20 Feb 2020 15:27:19 +0100
Subject: [PATCH] Use DeviceStream init(...) function to create streams

Change the stream creation procedures from direct calls to CUDA
and OpenCL API to using pre-defined init(...) method of the
DeviceStream class.

Refs #3314
Refs #3311

Change-Id: I96a0ca41f251b9925ef9bed77c4f355939b65c6d
---
 src/gromacs/ewald/pme_gpu_internal.cpp        | 30 ++-------------
 src/gromacs/ewald/pme_pp_comm_gpu_impl.cu     |  7 ++--
 .../state_propagator_data_gpu_impl_gpu.cpp    |  8 ++--
 .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu        | 20 +++-------
 src/gromacs/nbnxm/gpu_data_mgmt.h             |  1 -
 src/gromacs/nbnxm/nbnxm_setup.cpp             |  2 +-
 .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp      | 38 ++-----------------
 7 files changed, 21 insertions(+), 85 deletions(-)

diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index 822109de4c..ae308bf570 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -524,32 +524,8 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
      * - default high priority with CUDA
      * - no priorities implemented yet with OpenCL; see #2532
      */
-#if GMX_GPU == GMX_GPU_CUDA
-    cudaError_t stat;
-    int         highest_priority, lowest_priority;
-    stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority);
-    CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed");
-    cudaStream_t stream;
-    stat = cudaStreamCreateWithPriority(&stream,
-                                        cudaStreamDefault, // cudaStreamNonBlocking,
-                                        highest_priority);
-    pmeGpu->archSpecific->pmeStream_.setStream(stream);
-    CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed");
-#elif GMX_GPU == GMX_GPU_OPENCL
-    cl_command_queue_properties queueProperties =
-            pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
-    cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId;
-    cl_int       clError;
-    pmeGpu->archSpecific->pmeStream_.setStream(clCreateCommandQueue(
-            pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError));
-
-
-    if (clError != CL_SUCCESS)
-    {
-        GMX_THROW(gmx::InternalError(
-                gmx::formatString("Failed to create PME command queue (OpenCL error %d)", clError).c_str()));
-    }
-#endif
+    pmeGpu->archSpecific->pmeStream_.init(*pmeGpu->deviceInfo, pmeGpu->archSpecific->deviceContext_,
+                                          DeviceStreamPriority::High, pmeGpu->archSpecific->useTiming);
 }
 
 void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu)
@@ -812,6 +788,8 @@ static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeG
  */
 static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram)
 {
+    GMX_ASSERT(deviceInfo != nullptr,
+               "Device information can not be nullptr when GPU is used for PME.");
     pme->gpu       = new PmeGpu();
     PmeGpu* pmeGpu = pme->gpu;
     changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy());
diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
index f5aac2981a..32a752746a 100644
--- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
+++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
@@ -64,9 +64,10 @@ PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& device
     GMX_RELEASE_ASSERT(
             GMX_THREAD_MPI,
             "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
-    cudaStream_t stream;
-    cudaStreamCreate(&stream);
-    pmePpCommStream_.setStream(stream);
+
+    // In CUDA we only need priority to create stream.
+    // (note that this will be moved from here in the follow-up patch)
+    pmePpCommStream_.init(DeviceInformation(), DeviceContext(), DeviceStreamPriority::Normal, false);
 }
 
 PmePpCommGpu::Impl::~Impl() = default;
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index b1fefd34a5..d0027852ee 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -109,12 +109,10 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
 
         // TODO: The update stream should be created only when it is needed.
 #    if (GMX_GPU == GMX_GPU_CUDA)
-        cudaError_t  stat;
-        cudaStream_t stream;
-        stat = cudaStreamCreate(&stream);
-        updateStreamOwn_.setStream(stream);
+        // In CUDA we only need priority to create stream.
+        // (note that this will be moved from here in the follow-up patch)
+        updateStreamOwn_.init(DeviceInformation(), DeviceContext(), DeviceStreamPriority::Normal, false);
         updateStream_ = &updateStreamOwn_;
-        CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
 #    endif
     }
 
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index cbd5f8ce01..36342b935f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -418,8 +418,7 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
                    const interaction_const_t* ic,
                    const PairlistParams&      listParams,
                    const nbnxn_atomdata_t*    nbat,
-                   int /*rank*/,
-                   bool bLocalAndNonlocal)
+                   bool                       bLocalAndNonlocal)
 {
     cudaError_t stat;
 
@@ -448,10 +447,8 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
     nb->deviceInfo = deviceInfo;
 
     /* local/non-local GPU streams */
-    cudaStream_t localStream;
-    stat = cudaStreamCreate(&localStream);
-    nb->deviceStreams[InteractionLocality::Local].setStream(localStream);
-    CU_RET_ERR(stat, "cudaStreamCreate on stream[InterationLocality::Local] failed");
+    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceInfo, DeviceContext(),
+                                                       DeviceStreamPriority::Normal, nb->bDoTime);
     if (nb->bUseTwoStreams)
     {
         init_plist(nb->plist[InteractionLocality::NonLocal]);
@@ -460,15 +457,8 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
          * priorities, because we are querying the priority range which in this
          * case will be a single value.
          */
-        int highest_priority;
-        stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
-        CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
-
-        cudaStream_t nonLocalStream;
-        stat = cudaStreamCreateWithPriority(&nonLocalStream, cudaStreamDefault, highest_priority);
-        nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream);
-        CU_RET_ERR(stat,
-                   "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed");
+        nb->deviceStreams[InteractionLocality::NonLocal].init(
+                *nb->deviceInfo, DeviceContext(), DeviceStreamPriority::High, nb->bDoTime);
     }
 
     /* init events for sychronization (timing disabled for performance reasons!) */
diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h
index e242771862..574588b39a 100644
--- a/src/gromacs/nbnxm/gpu_data_mgmt.h
+++ b/src/gromacs/nbnxm/gpu_data_mgmt.h
@@ -72,7 +72,6 @@ NbnxmGpu* gpu_init(const DeviceInformation gmx_unused* deviceInfo,
                    const interaction_const_t gmx_unused* ic,
                    const PairlistParams gmx_unused& listParams,
                    const nbnxn_atomdata_t gmx_unused* nbat,
-                   int gmx_unused rank,
                    /* true if both local and non-local are done on GPU */
                    bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp
index f7c7f6dd16..d854ede572 100644
--- a/src/gromacs/nbnxm/nbnxm_setup.cpp
+++ b/src/gromacs/nbnxm/nbnxm_setup.cpp
@@ -452,7 +452,7 @@ std::unique_ptr<nonbonded_verlet_t> init_nb_verlet(const gmx::MDLogger&     mdlo
         /* init the NxN GPU data; the last argument tells whether we'll have
          * both local and non-local NB calculation on GPU */
         gpu_nbv = gpu_init(deviceInfo, *deviceContext, fr->ic, pairlistParams, nbat.get(),
-                           cr->nodeid, haveMultipleDomains);
+                           haveMultipleDomains);
 
         minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv);
     }
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index fa37263a5b..adc60b0a39 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -560,12 +560,8 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
                    const interaction_const_t* ic,
                    const PairlistParams&      listParams,
                    const nbnxn_atomdata_t*    nbat,
-                   const int                  rank,
                    const bool                 bLocalAndNonlocal)
 {
-    cl_int                      cl_error;
-    cl_command_queue_properties queue_properties;
-
     GMX_ASSERT(ic, "Need a valid interaction constants object");
 
     auto nb = new NbnxmGpu;
@@ -596,42 +592,16 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
     /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
     nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
 
-    /* Create queues only after bDoTime has been initialized */
-    if (nb->bDoTime)
-    {
-        queue_properties = CL_QUEUE_PROFILING_ENABLE;
-    }
-    else
-    {
-        queue_properties = 0;
-    }
-
-    cl_command_queue localStream =
-            clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(),
-                                 nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
     /* local/non-local GPU streams */
-    nb->deviceStreams[InteractionLocality::Local].setStream(localStream);
-
-    if (CL_SUCCESS != cl_error)
-    {
-        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank,
-                  nb->deviceInfo->device_name, cl_error);
-    }
+    nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceInfo, nb->dev_rundata->deviceContext_,
+                                                       DeviceStreamPriority::Normal, nb->bDoTime);
 
     if (nb->bUseTwoStreams)
     {
         init_plist(nb->plist[InteractionLocality::NonLocal]);
 
-        cl_command_queue nonLocalStream =
-                clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(),
-                                     nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
-        nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream);
-
-        if (CL_SUCCESS != cl_error)
-        {
-            gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
-                      rank, nb->deviceInfo->device_name, cl_error);
-        }
+        nb->deviceStreams[InteractionLocality::NonLocal].init(
+                *nb->deviceInfo, nb->dev_rundata->deviceContext_, DeviceStreamPriority::High, nb->bDoTime);
     }
 
     if (nb->bDoTime)
-- 
2.22.0