Make DeviceStream into a class
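
This patch replaces the raw cudaStream_t handles stored per interaction
locality in NbnxmGpu with DeviceStream objects. Below is a minimal sketch of
the interface the new call sites assume (setStream() to adopt a stream created
in gpu_init(), stream() to hand the raw handle to APIs that still take a
cudaStream_t); the real class is defined outside this file and may differ, and
the destructor-based clean-up shown is only an assumption suggested by the
cudaStreamDestroy loop this patch removes from gpu_free().

    // Sketch only: interface inferred from the call sites in this diff.
    #include <cuda_runtime.h>

    class DeviceStream
    {
    public:
        DeviceStream() = default;
        ~DeviceStream()
        {
            // Assumed RAII clean-up, replacing the explicit stream-destroy
            // loop removed from gpu_free() below.
            if (stream_ != nullptr)
            {
                cudaStreamDestroy(stream_);
            }
        }
        //! Adopt an externally created CUDA stream (used in gpu_init()).
        void setStream(cudaStream_t stream) { stream_ = stream; }
        //! Raw CUDA stream, for code that still needs a cudaStream_t.
        cudaStream_t stream() const { return stream_; }

    private:
        cudaStream_t stream_ = nullptr;
    };
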
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index 666aefc62910b21babf656313f8bd17af9c25385..cbd5f8ce0197b3b434f06c58846f5d392f5384c6 100644
@@ -448,7 +448,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
     nb->deviceInfo = deviceInfo;
 
     /* local/non-local GPU streams */
-    stat = cudaStreamCreate(&nb->stream[InteractionLocality::Local]);
+    cudaStream_t localStream;
+    stat = cudaStreamCreate(&localStream);
+    nb->deviceStreams[InteractionLocality::Local].setStream(localStream);
     CU_RET_ERR(stat, "cudaStreamCreate on stream[InterationLocality::Local] failed");
     if (nb->bUseTwoStreams)
     {
@@ -462,8 +464,9 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
         stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority);
         CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed");
 
-        stat = cudaStreamCreateWithPriority(&nb->stream[InteractionLocality::NonLocal],
-                                            cudaStreamDefault, highest_priority);
+        cudaStream_t nonLocalStream;
+        stat = cudaStreamCreateWithPriority(&nonLocalStream, cudaStreamDefault, highest_priority);
+        nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream);
         CU_RET_ERR(stat,
                    "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed");
     }
@@ -512,10 +515,10 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo,
 
 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 {
-    char         sbuf[STRLEN];
-    bool         bDoTime = (nb->bDoTime && !h_plist->sci.empty());
-    cudaStream_t stream  = nb->stream[iloc];
-    cu_plist_t*  d_plist = nb->plist[iloc];
+    char                sbuf[STRLEN];
+    bool                bDoTime      = (nb->bDoTime && !h_plist->sci.empty());
+    const DeviceStream& deviceStream = nb->deviceStreams[iloc];
+    cu_plist_t*         d_plist      = nb->plist[iloc];
 
     if (d_plist->na_c < 0)
     {
@@ -535,18 +538,18 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 
     if (bDoTime)
     {
-        iTimers.pl_h2d.openTimingRegion(stream);
+        iTimers.pl_h2d.openTimingRegion(deviceStream);
         iTimers.didPairlistH2D = true;
     }
 
     reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
                            DeviceContext());
-    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
+    copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
                            DeviceContext());
-    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
+    copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
@@ -554,12 +557,12 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 
     reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
                            &d_plist->excl_nalloc, DeviceContext());
-    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
+    copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     if (bDoTime)
     {
-        iTimers.pl_h2d.closeTimingRegion(stream);
+        iTimers.pl_h2d.closeTimingRegion(deviceStream);
     }
 
     /* the next use of this list will be the first one, so we need to prune */
@@ -569,7 +572,7 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->stream[InteractionLocality::Local];
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
 
     /* only if we have a dynamic box */
     if (nbatom->bDynamicBox || !adat->bShiftVecUploaded)
@@ -584,7 +587,7 @@ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->stream[InteractionLocality::Local];
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
 
     stat = cudaMemsetAsync(adat->f, 0, natoms_clear * sizeof(*adat->f), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on f falied");
@@ -595,7 +598,7 @@ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb)
 {
     cudaError_t    stat;
     cu_atomdata_t* adat = nb->atdat;
-    cudaStream_t   ls   = nb->stream[InteractionLocality::Local];
+    cudaStream_t   ls   = nb->deviceStreams[InteractionLocality::Local].stream();
 
     stat = cudaMemsetAsync(adat->fshift, 0, SHIFTS * sizeof(*adat->fshift), ls);
     CU_RET_ERR(stat, "cudaMemsetAsync on fshift falied");
@@ -618,13 +621,13 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
 
 void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 {
-    cudaError_t    stat;
-    int            nalloc, natoms;
-    bool           realloced;
-    bool           bDoTime = nb->bDoTime;
-    cu_timers_t*   timers  = nb->timers;
-    cu_atomdata_t* d_atdat = nb->atdat;
-    cudaStream_t   ls      = nb->stream[InteractionLocality::Local];
+    cudaError_t         stat;
+    int                 nalloc, natoms;
+    bool                realloced;
+    bool                bDoTime      = nb->bDoTime;
+    cu_timers_t*        timers       = nb->timers;
+    cu_atomdata_t*      d_atdat      = nb->atdat;
+    const DeviceStream& deviceStream = nb->deviceStreams[InteractionLocality::Local];
 
     natoms    = nbat->numAtoms();
     realloced = false;
@@ -632,7 +635,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     if (bDoTime)
     {
         /* time async copy */
-        timers->atdat.openTimingRegion(ls);
+        timers->atdat.openTimingRegion(deviceStream);
     }
 
     /* need to reallocate if we have to copy more atoms than the amount of space
@@ -681,17 +684,17 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
     if (useLjCombRule(nb->nbparam))
     {
         cu_copy_H2D_async(d_atdat->lj_comb, nbat->params().lj_comb.data(),
-                          natoms * sizeof(*d_atdat->lj_comb), ls);
+                          natoms * sizeof(*d_atdat->lj_comb), deviceStream.stream());
     }
     else
     {
         cu_copy_H2D_async(d_atdat->atom_types, nbat->params().type.data(),
-                          natoms * sizeof(*d_atdat->atom_types), ls);
+                          natoms * sizeof(*d_atdat->atom_types), deviceStream.stream());
     }
 
     if (bDoTime)
     {
-        timers->atdat.closeTimingRegion(ls);
+        timers->atdat.closeTimingRegion(deviceStream);
     }
 }
 
@@ -725,15 +728,6 @@ void gpu_free(NbnxmGpu* nb)
     CU_RET_ERR(stat, "cudaEventDestroy failed on timers->misc_ops_and_local_H2D_done");
 
     delete nb->timers;
-    if (nb->bDoTime)
-    {
-        /* The non-local counters/stream (second in the array) are needed only with DD. */
-        for (int i = 0; i <= (nb->bUseTwoStreams ? 1 : 0); i++)
-        {
-            stat = cudaStreamDestroy(nb->stream[i]);
-            CU_RET_ERR(stat, "cudaStreamDestroy failed on stream");
-        }
-    }
 
     if (!useLjCombRule(nb->nbparam))
     {
@@ -822,11 +816,11 @@ gmx_bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
     return ((nb->nbparam->eeltype == eelCuEWALD_ANA) || (nb->nbparam->eeltype == eelCuEWALD_ANA_TWIN));
 }
 
-void* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
+const DeviceStream* gpu_get_command_stream(NbnxmGpu* nb, const InteractionLocality iloc)
 {
     assert(nb);
 
-    return static_cast<void*>(&nb->stream[iloc]);
+    return &nb->deviceStreams[iloc];
 }
 
 void* gpu_get_xq(NbnxmGpu* nb)
@@ -854,9 +848,9 @@ DeviceBuffer<gmx::RVec> gpu_get_fshift(NbnxmGpu* nb)
 /* TODO: Remove explicit pinning of host arrays from here and manage it in a more natural way */
 void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
 {
-    cudaStream_t stream        = gpu_nbv->stream[InteractionLocality::Local];
-    bool         bDoTime       = gpu_nbv->bDoTime;
-    const int    maxNumColumns = gridSet.numColumnsMax();
+    const DeviceStream& deviceStream  = gpu_nbv->deviceStreams[InteractionLocality::Local];
+    bool                bDoTime       = gpu_nbv->bDoTime;
+    const int           maxNumColumns = gridSet.numColumnsMax();
 
     reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(),
                            &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext());
@@ -882,15 +876,15 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
             }
 
-            copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, stream,
+            copyToDeviceBuffer(&gpu_nbv->atomIndices, atomIndices, 0, atomIndicesSize, deviceStream,
                                GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
             }
         }
 
@@ -898,28 +892,30 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
         {
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
             }
 
             int* destPtr = &gpu_nbv->cxy_na[maxNumColumns * g];
-            copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            copyToDeviceBuffer(&destPtr, cxy_na, 0, numColumns, deviceStream,
+                               GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
             }
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.openTimingRegion(deviceStream);
             }
 
             destPtr = &gpu_nbv->cxy_ind[maxNumColumns * g];
-            copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, stream, GpuApiCallBehavior::Async, nullptr);
+            copyToDeviceBuffer(&destPtr, cxy_ind, 0, numColumns, deviceStream,
+                               GpuApiCallBehavior::Async, nullptr);
 
             if (bDoTime)
             {
-                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(stream);
+                gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d.closeTimingRegion(deviceStream);
             }
         }
     }
@@ -943,7 +939,7 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
                                     GpuEventSynchronizer* const localReductionDone)
 {
 
-    cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
+    const DeviceStream& deviceStream = gpu_nbv->deviceStreams[InteractionLocality::Local];
 
     GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
     gpu_nbv->localFReductionDone = localReductionDone;
@@ -952,7 +948,8 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
     {
         reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc,
                                DeviceContext());
-        copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
+        copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, deviceStream,
+                           GpuApiCallBehavior::Async, nullptr);
     }
 
     return;
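
Since gpu_get_command_stream() now returns a typed const DeviceStream* instead
of a void*, callers outside this file need a small adjustment. A hedged sketch
of the caller-side change (the concrete call sites are not part of this file,
and namespace qualifications are omitted as in the diff above):

    // Before this change the caller received a void* and cast it back:
    //     cudaStream_t stream = *static_cast<cudaStream_t*>(
    //             gpu_get_command_stream(nb, InteractionLocality::Local));
    // With the typed return, it asks the wrapper for the raw stream instead:
    const DeviceStream* deviceStream =
            gpu_get_command_stream(nb, InteractionLocality::Local);
    cudaStream_t stream = deviceStream->stream();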