Make DeviceContext into a proper class
author    Artem Zhmurov <zhmurov@gmail.com>
          Wed, 29 Jan 2020 15:59:33 +0000 (16:59 +0100)
committer Christian Blau <cblau@gerrit.gromacs.org>
          Mon, 9 Mar 2020 17:33:41 +0000 (18:33 +0100)
Having DeviceContext as a platform-agnostic class makes it easy
to create a context, which is useful for setting up the GPU
stream/context manager and for tests of GPU functionality.
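
A minimal sketch of the intended call pattern, based on the changes
below (deviceInfo stands for an already-detected DeviceInformation;
numValues is a placeholder):

    DeviceInformation   deviceInfo;                  // filled in by GPU detection elsewhere
    const DeviceContext deviceContext(deviceInfo);   // creates cl_context in OpenCL, no-op in CUDA

    DeviceBuffer<float> d_buffer;
    allocateDeviceBuffer(&d_buffer, numValues, deviceContext); // context now passed by const reference
    // ... use the buffer ...
    freeDeviceBuffer(&d_buffer);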

Refs. #3315.

Change-Id: I496d225b58c96d7642830c8e9552139750891849

47 files changed:
src/gromacs/domdec/gpuhaloexchange_impl.cu
src/gromacs/domdec/gpuhaloexchange_impl.cuh
src/gromacs/ewald/pme.h
src/gromacs/ewald/pme_gpu.cpp
src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp
src/gromacs/ewald/pme_gpu_internal.cpp
src/gromacs/ewald/pme_gpu_internal.h
src/gromacs/ewald/pme_gpu_program_impl.h
src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp
src/gromacs/ewald/pme_gpu_types_host_impl.h
src/gromacs/ewald/pme_only.cpp
src/gromacs/ewald/pme_pp_comm_gpu_impl.cu
src/gromacs/ewald/tests/pmetestcommon.cpp
src/gromacs/gpu_utils/CMakeLists.txt
src/gromacs/gpu_utils/device_context.h [new file with mode: 0644]
src/gromacs/gpu_utils/device_context_ocl.cpp [new file with mode: 0644]
src/gromacs/gpu_utils/device_context_ocl.h [new file with mode: 0644]
src/gromacs/gpu_utils/devicebuffer.cuh
src/gromacs/gpu_utils/devicebuffer.h
src/gromacs/gpu_utils/devicebuffer_ocl.h
src/gromacs/gpu_utils/gputraits.cuh
src/gromacs/gpu_utils/gputraits.h
src/gromacs/gpu_utils/gputraits_ocl.h
src/gromacs/gpu_utils/oclutils.h
src/gromacs/gpu_utils/tests/typecasts_runner.cu
src/gromacs/listed_forces/gpubonded_impl.cu
src/gromacs/listed_forces/gpubonded_impl.h
src/gromacs/mdlib/leapfrog_gpu.cu
src/gromacs/mdlib/leapfrog_gpu.cuh
src/gromacs/mdlib/lincs_gpu.cu
src/gromacs/mdlib/lincs_gpu.cuh
src/gromacs/mdlib/settle_gpu.cu
src/gromacs/mdlib/settle_gpu.cuh
src/gromacs/mdlib/tests/constrtestrunners.cu
src/gromacs/mdlib/tests/leapfrogtestrunners.cu
src/gromacs/mdlib/tests/settletestrunners.cu
src/gromacs/mdlib/update_constrain_gpu_impl.cu
src/gromacs/mdlib/update_constrain_gpu_impl.h
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
src/gromacs/nbnxm/opencl/nbnxm_ocl_jit_support.cpp

index 2ac4228d6f2a6f4c3b3f625d671f6b71ae94ebe8..92a1d9f3d5d2d2235099eb0ae230e01331da19e0 100644 (file)
@@ -147,9 +147,9 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
     // reallocate on device only if needed
     if (newSize > maxPackedBufferSize_)
     {
-        reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, nullptr);
-        reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, nullptr);
-        reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, nullptr);
+        reallocateDeviceBuffer(&d_indexMap_, newSize, &indexMapSize_, &indexMapSizeAlloc_, deviceContext_);
+        reallocateDeviceBuffer(&d_sendBuf_, newSize, &sendBufSize_, &sendBufSizeAlloc_, deviceContext_);
+        reallocateDeviceBuffer(&d_recvBuf_, newSize, &recvBufSize_, &recvBufSizeAlloc_, deviceContext_);
         maxPackedBufferSize_ = newSize;
     }
 
@@ -448,7 +448,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,
 
     changePinningPolicy(&h_indexMap_, gmx::PinningPolicy::PinnedIfSupported);
 
-    allocateDeviceBuffer(&d_fShift_, 1, nullptr);
+    allocateDeviceBuffer(&d_fShift_, 1, deviceContext_);
 }
 
 GpuHaloExchange::Impl::~Impl()
index b139a9b491e2704b927a8663af948c4b843fe6dd..a8d2f9204c590894d489d266a221292bd3b6d056 100644 (file)
@@ -47,6 +47,7 @@
 #define GMX_DOMDEC_GPUHALOEXCHANGE_IMPL_H
 
 #include "gromacs/domdec/gpuhaloexchange.h"
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/utility/gmxmpi.h"
@@ -175,6 +176,8 @@ private:
     GpuEventSynchronizer* haloDataTransferLaunched_ = nullptr;
     //! MPI communicator used for simulation
     MPI_Comm mpi_comm_mysim_;
+    //! Dummy GPU context object
+    const DeviceContext deviceContext_;
     //! CUDA stream for local non-bonded calculations
     cudaStream_t localStream_ = nullptr;
     //! CUDA stream for non-local non-bonded calculations
index ef89a5bb7925a641e75724346a01fcff42670f86..40a34682c00f173841f415722d165e038e44676c 100644 (file)
@@ -71,6 +71,7 @@ struct gmx_pme_t;
 struct gmx_wallcycle;
 struct NumPmeDomains;
 
+class DeviceContext;
 enum class GpuTaskCompletion;
 class PmeGpuProgram;
 class GpuEventSynchronizer;
@@ -439,7 +440,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_device_stream(const gmx_pme_t* GPU_FUNC_ARG
  *  \param[in] pme            The PME data structure.
  *  \returns                  Pointer to GPU context object.
  */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_device_context(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
+GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
index 7f092304b50e170731a74f4eb4accd42c2036c59..b4cec47135c8af61d2f66a368ba635290b4647ac 100644 (file)
@@ -442,12 +442,11 @@ void* pme_gpu_get_device_stream(const gmx_pme_t* pme)
     return pme_gpu_get_stream(pme->gpu);
 }
 
-void* pme_gpu_get_device_context(const gmx_pme_t* pme)
+const DeviceContext* pme_gpu_get_device_context(const gmx_pme_t* pme)
 {
-    if (!pme || !pme_gpu_active(pme))
-    {
-        return nullptr;
-    }
+    GMX_RELEASE_ASSERT(pme, "GPU context requested from PME before PME was constructed.");
+    GMX_RELEASE_ASSERT(pme_gpu_active(pme),
+                       "GPU context requested from PME, but PME is running on the CPU.");
     return pme_gpu_get_context(pme->gpu);
 }
 
index 2b14dc7567b3849960f169dfcebef49b6475dce7..c6e1b6448cc6093985b6e664f07fc6609f57a37a 100644 (file)
@@ -80,7 +80,7 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu)
                            == kernelParamsPtr->grid.complexGridSize[i],
                    "Complex padding not implemented");
     }
-    cl_context context = pmeGpu->archSpecific->context;
+    cl_context context = pmeGpu->archSpecific->deviceContext_.context();
     commandStreams_.push_back(pmeGpu->archSpecific->pmeStream);
     realGrid_                       = kernelParamsPtr->grid.d_realGrid;
     complexGrid_                    = kernelParamsPtr->grid.d_fourierGrid;
index f9932eb36b6eef317b89c90d61d058ef7d3dfebd..17dd6805c06d2d899583348a94f3468944307fff 100644 (file)
@@ -142,7 +142,7 @@ void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
 {
     const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
     allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount,
-                         pmeGpu->archSpecific->context);
+                         pmeGpu->archSpecific->deviceContext_);
     pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize);
 }
 
@@ -172,7 +172,8 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu)
     const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
     reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
                            &pmeGpu->archSpecific->splineValuesSize,
-                           &pmeGpu->archSpecific->splineValuesSizeAlloc, pmeGpu->archSpecific->context);
+                           &pmeGpu->archSpecific->splineValuesSizeAlloc,
+                           pmeGpu->archSpecific->deviceContext_);
     if (shouldRealloc)
     {
         /* Reallocate the host buffer */
@@ -202,8 +203,8 @@ void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
     const size_t newForcesSize = pmeGpu->nAtomsAlloc * DIM;
     GMX_ASSERT(newForcesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_forces, newForcesSize,
-                           &pmeGpu->archSpecific->forcesSize,
-                           &pmeGpu->archSpecific->forcesSizeAlloc, pmeGpu->archSpecific->context);
+                           &pmeGpu->archSpecific->forcesSize, &pmeGpu->archSpecific->forcesSizeAlloc,
+                           pmeGpu->archSpecific->deviceContext_);
     pmeGpu->staging.h_forces.reserveWithPadding(pmeGpu->nAtomsAlloc);
     pmeGpu->staging.h_forces.resizeWithPadding(pmeGpu->kernelParams->atoms.nAtoms);
 }
@@ -238,7 +239,8 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_
     GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
                            &pmeGpu->archSpecific->coefficientsSize,
-                           &pmeGpu->archSpecific->coefficientsSizeAlloc, pmeGpu->archSpecific->context);
+                           &pmeGpu->archSpecific->coefficientsSizeAlloc,
+                           pmeGpu->archSpecific->deviceContext_);
     copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
                        const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
                        pmeGpu->archSpecific->pmeStream, pmeGpu->settings.transferKind, nullptr);
@@ -270,11 +272,11 @@ void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
     const bool shouldRealloc        = (newSplineDataSize > pmeGpu->archSpecific->splineDataSize);
     int        currentSizeTemp      = pmeGpu->archSpecific->splineDataSize;
     int        currentSizeTempAlloc = pmeGpu->archSpecific->splineDataSizeAlloc;
-    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize,
-                           &currentSizeTemp, &currentSizeTempAlloc, pmeGpu->archSpecific->context);
+    reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_theta, newSplineDataSize, &currentSizeTemp,
+                           &currentSizeTempAlloc, pmeGpu->archSpecific->deviceContext_);
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_dtheta, newSplineDataSize,
-                           &pmeGpu->archSpecific->splineDataSize,
-                           &pmeGpu->archSpecific->splineDataSizeAlloc, pmeGpu->archSpecific->context);
+                           &pmeGpu->archSpecific->splineDataSize, &pmeGpu->archSpecific->splineDataSizeAlloc,
+                           pmeGpu->archSpecific->deviceContext_);
     // the host side reallocation
     if (shouldRealloc)
     {
@@ -300,7 +302,8 @@ void pme_gpu_realloc_grid_indices(PmeGpu* pmeGpu)
     GMX_ASSERT(newIndicesSize > 0, "Bad number of atoms in PME GPU");
     reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_gridlineIndices, newIndicesSize,
                            &pmeGpu->archSpecific->gridlineIndicesSize,
-                           &pmeGpu->archSpecific->gridlineIndicesSizeAlloc, pmeGpu->archSpecific->context);
+                           &pmeGpu->archSpecific->gridlineIndicesSizeAlloc,
+                           pmeGpu->archSpecific->deviceContext_);
     pfree(pmeGpu->staging.h_gridlineIndices);
     pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_gridlineIndices), newIndicesSize * sizeof(int));
 }
@@ -326,10 +329,11 @@ void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
         /* 2 separate grids */
         reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
                                &pmeGpu->archSpecific->complexGridSize,
-                               &pmeGpu->archSpecific->complexGridSizeAlloc, pmeGpu->archSpecific->context);
-        reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid, newRealGridSize,
-                               &pmeGpu->archSpecific->realGridSize,
-                               &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
+                               &pmeGpu->archSpecific->complexGridSizeAlloc,
+                               pmeGpu->archSpecific->deviceContext_);
+        reallocateDeviceBuffer(
+                &kernelParamsPtr->grid.d_realGrid, newRealGridSize, &pmeGpu->archSpecific->realGridSize,
+                &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_);
     }
     else
     {
@@ -337,7 +341,7 @@ void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
         const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
         reallocateDeviceBuffer(
                 &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize,
-                &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->context);
+                &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_);
         kernelParamsPtr->grid.d_fourierGrid   = kernelParamsPtr->grid.d_realGrid;
         pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
         // the size might get used later for copying the grid
@@ -385,9 +389,9 @@ void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
 #elif GMX_GPU == GMX_GPU_OPENCL
     // No dedicated texture routines....
     allocateDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, newFractShiftsSize,
-                         pmeGpu->archSpecific->context);
+                         pmeGpu->archSpecific->deviceContext_);
     allocateDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable, newFractShiftsSize,
-                         pmeGpu->archSpecific->context);
+                         pmeGpu->archSpecific->deviceContext_);
     copyToDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable, pmeGpu->common->fsh.data(), 0,
                        newFractShiftsSize, pmeGpu->archSpecific->pmeStream,
                        GpuApiCallBehavior::Async, nullptr);
@@ -486,7 +490,7 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
 #endif
 
     /* Allocate the target-specific structures */
-    pmeGpu->archSpecific.reset(new PmeGpuSpecific());
+    pmeGpu->archSpecific.reset(new PmeGpuSpecific(pmeGpu->programHandle_->impl_->deviceContext_));
     pmeGpu->kernelParams.reset(new PmeGpuKernelParams());
 
     pmeGpu->archSpecific->performOutOfPlaceFFT = true;
@@ -495,9 +499,6 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
      * TODO: PME could also try to pick up nice grid sizes (with factors of 2, 3, 5, 7).
      */
 
-    // TODO: this is just a convenient reuse because programHandle_ currently is in charge of creating context
-    pmeGpu->archSpecific->context = pmeGpu->programHandle_->impl_->context;
-
     // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?) and reuse in NB
     if (GMX_GPU == GMX_GPU_CUDA)
     {
@@ -537,8 +538,8 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu)
             pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0;
     cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId;
     cl_int       clError;
-    pmeGpu->archSpecific->pmeStream =
-            clCreateCommandQueue(pmeGpu->archSpecific->context, device_id, queueProperties, &clError);
+    pmeGpu->archSpecific->pmeStream = clCreateCommandQueue(
+            pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError);
     if (clError != CL_SUCCESS)
     {
         GMX_THROW(gmx::InternalError("Failed to create PME command queue"));
@@ -1526,16 +1527,12 @@ void* pme_gpu_get_stream(const PmeGpu* pmeGpu)
     }
 }
 
-void* pme_gpu_get_context(const PmeGpu* pmeGpu)
+const DeviceContext* pme_gpu_get_context(const PmeGpu* pmeGpu)
 {
-    if (pmeGpu)
-    {
-        return static_cast<void*>(&pmeGpu->archSpecific->context);
-    }
-    else
-    {
-        return nullptr;
-    }
+    GMX_RELEASE_ASSERT(
+            pmeGpu,
+            "GPU context object was requested, but PME GPU object was not (yet) initialized.");
+    return &pmeGpu->archSpecific->deviceContext_;
 }
 
 GpuEventSynchronizer* pme_gpu_get_forces_ready_synchronizer(const PmeGpu* pmeGpu)
index 0c5b53ab0ec94e88fba54b013e43ebdf45d27aa8..a9dc9677ce53a55405a43ce5564aa9c43d3fa96a 100644 (file)
@@ -412,7 +412,7 @@ GPU_FUNC_QUALIFIER void* pme_gpu_get_stream(const PmeGpu* GPU_FUNC_ARGUMENT(pmeG
  * \param[in] pmeGpu         The PME GPU structure.
  * \returns                  Pointer to context object.
  */
-GPU_FUNC_QUALIFIER void* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
+GPU_FUNC_QUALIFIER const DeviceContext* pme_gpu_get_context(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu))
         GPU_FUNC_TERM_WITH_RETURN(nullptr);
 
 /*! \brief Return pointer to the sync object triggered after the PME force calculation completion
index 2c79f20a71ba53e49827489b4d91f9b53fb32fb6..0b007c39665f55384211d753324d542aa205dbea 100644 (file)
@@ -44,6 +44,7 @@
 
 #include "config.h"
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/utility/classhelpers.h"
 
@@ -77,7 +78,7 @@ struct PmeGpuProgramImpl
      * TODO: Later we want to be able to own the context at a higher level and not here,
      * but this class would still need the non-owning context handle to build the kernels.
      */
-    DeviceContext context;
+    DeviceContext deviceContext_;
 
     //! Conveniently all the PME kernels use the same single argument type
 #if GMX_GPU == GMX_GPU_CUDA
index ec220c7c2a7983ef09375d68e4451dc50fa05ff3..1353a99ed2128487b7e32e4d41cccd05206cc413 100644 (file)
@@ -64,7 +64,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation* deviceInfo)
     contextProperties[2] = 0; /* Terminates the list of properties */
 
     cl_int clError;
-    context = clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError);
+    deviceContext_.setContext(clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError));
     if (clError != CL_SUCCESS)
     {
         const std::string errorString = gmx::formatString(
@@ -74,7 +74,7 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceInformation* deviceInfo)
     }
 
     // kernel parameters
-    warpSize = gmx::ocl::getDeviceWarpSize(context, deviceId);
+    warpSize = gmx::ocl::getDeviceWarpSize(deviceContext_.context(), deviceId);
     // TODO: for Intel ideally we'd want to set these based on the compiler warp size
     // but given that we've done no tuning for Intel iGPU, this is as good as anything.
     spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize, deviceInfo->maxWorkGroupSize);
@@ -96,7 +96,6 @@ PmeGpuProgramImpl::~PmeGpuProgramImpl()
     stat |= clReleaseKernel(solveXYZEnergyKernel);
     stat |= clReleaseKernel(solveYZXKernel);
     stat |= clReleaseKernel(solveYZXEnergyKernel);
-    stat |= clReleaseContext(context);
     GMX_ASSERT(stat == CL_SUCCESS,
                gmx::formatString("Failed to release PME OpenCL resources %d: %s", stat,
                                  ocl_get_error_string(stat).c_str())
@@ -165,8 +164,8 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation* deviceInfo)
             /* TODO when we have a proper MPI-aware logging module,
                the log output here should be written there */
             program = gmx::ocl::compileProgram(stderr, "gromacs/ewald", "pme_program.cl",
-                                               commonDefines, context, deviceInfo->oclDeviceId,
-                                               deviceInfo->deviceVendor);
+                                               commonDefines, deviceContext_.context(),
+                                               deviceInfo->oclDeviceId, deviceInfo->deviceVendor);
         }
         catch (gmx::GromacsException& e)
         {
index be4f986957d84f257fd5fc3658886bcbb6579c82..44ca3fd3c3561c64b467e13527f59b0f8c78c627 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -60,6 +60,8 @@
 
 #include "gromacs/timing/gpu_timing.h" // for gtPME_EVENT_COUNT
 
+#include "pme_gpu_3dfft.h"
+
 class GpuParallel3dFft;
 
 /*! \internal \brief
@@ -67,6 +69,11 @@ class GpuParallel3dFft;
  */
 struct PmeGpuSpecific
 {
+    /*! \brief Constructor
+     *
+     * \param[in] deviceContext GPU device context.
+     */
+    PmeGpuSpecific(const DeviceContext& deviceContext) : deviceContext_(deviceContext) {}
     /*! \brief The GPU stream where everything related to the PME happens. */
     CommandStream pmeStream;
 
@@ -76,7 +83,7 @@ struct PmeGpuSpecific
      * but should be a constructor parameter to PmeGpu, as well as PmeGpuProgram,
      * managed by high-level code.
      */
-    DeviceContext context;
+    const DeviceContext& deviceContext_;
 
     /* Synchronization events */
     /*! \brief Triggered after the PME Force Calculations have been completed */
@@ -86,13 +93,13 @@ struct PmeGpuSpecific
 
     /* Settings which are set at the start of the run */
     /*! \brief A boolean which tells whether the complex and real grids for cu/clFFT are different or the same. Currently true. */
-    bool performOutOfPlaceFFT;
+    bool performOutOfPlaceFFT = false;
     /*! \brief A boolean which tells if the GPU timing events are enabled.
      *  False by default, can be enabled by setting the environment variable GMX_ENABLE_GPU_TIMING.
      *  Note: will not be reliable when multiple GPU tasks are running concurrently on the same
      * device context, as CUDA events on multiple streams are untrustworthy.
      */
-    bool useTiming;
+    bool useTiming = false;
 
     //! Vector of FFT setups
     std::vector<std::unique_ptr<GpuParallel3dFft>> fftSetup;
@@ -112,37 +119,37 @@ struct PmeGpuSpecific
      * TODO: these should live in a clean buffered container type, and be refactored in the NB/cudautils as well.
      */
     /*! \brief The kernelParams.atoms.coordinates float element count (actual)*/
-    int coordinatesSize;
+    int coordinatesSize = 0;
     /*! \brief The kernelParams.atoms.coordinates float element count (reserved) */
-    int coordinatesSizeAlloc;
+    int coordinatesSizeAlloc = 0;
     /*! \brief The kernelParams.atoms.forces float element count (actual) */
-    int forcesSize;
+    int forcesSize = 0;
     /*! \brief The kernelParams.atoms.forces float element count (reserved) */
-    int forcesSizeAlloc;
+    int forcesSizeAlloc = 0;
     /*! \brief The kernelParams.atoms.gridlineIndices int element count (actual) */
-    int gridlineIndicesSize;
+    int gridlineIndicesSize = 0;
     /*! \brief The kernelParams.atoms.gridlineIndices int element count (reserved) */
-    int gridlineIndicesSizeAlloc;
+    int gridlineIndicesSizeAlloc = 0;
     /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (actual) */
-    int splineDataSize;
+    int splineDataSize = 0;
     /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */
-    int splineDataSizeAlloc;
+    int splineDataSizeAlloc = 0;
     /*! \brief The kernelParams.atoms.coefficients float element count (actual) */
-    int coefficientsSize;
+    int coefficientsSize = 0;
     /*! \brief The kernelParams.atoms.coefficients float element count (reserved) */
-    int coefficientsSizeAlloc;
+    int coefficientsSizeAlloc = 0;
     /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */
-    int splineValuesSize;
+    int splineValuesSize = 0;
     /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
-    int splineValuesSizeAlloc;
+    int splineValuesSizeAlloc = 0;
     /*! \brief The kernelParams.grid.realGrid float element count (actual) */
-    int realGridSize;
+    int realGridSize = 0;
     /*! \brief The kernelParams.grid.realGrid float element count (reserved) */
-    int realGridSizeAlloc;
+    int realGridSizeAlloc = 0;
     /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */
-    int complexGridSize;
+    int complexGridSize = 0;
     /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */
-    int complexGridSizeAlloc;
+    int complexGridSizeAlloc = 0;
 };
 
 #endif
index eed93cd2e3809189e7f3a433f69cbbe9bd9c10aa..2ee17b32674959e1d78363a29ed7d87420a8d1e3 100644 (file)
@@ -628,8 +628,8 @@ int gmx_pmeonly(struct gmx_pme_t*         pme,
     const bool useGpuForPme = (runMode == PmeRunMode::GPU) || (runMode == PmeRunMode::Mixed);
     if (useGpuForPme)
     {
-        const void* commandStream = pme_gpu_get_device_stream(pme);
-        const void* deviceContext = pme_gpu_get_device_context(pme);
+        const void*          commandStream = pme_gpu_get_device_stream(pme);
+        const DeviceContext& deviceContext = *pme_gpu_get_device_context(pme);
 
         changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
         changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
index 827a1bab343f956694a8b09435bed71d99d2195c..29cb73d0cabeadfe92db0e5d8a1a3e7f58d5dbc8 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -73,7 +73,8 @@ void PmePpCommGpu::Impl::reinit(int size)
     MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
 
     // Reallocate buffer used for staging PME force on GPU
-    reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, nullptr);
+    reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_,
+                           DeviceContext());
 #else
     GMX_UNUSED_VALUE(size);
 #endif
index c5b93d94c2abc3dcf3df331659f2a83719f85cb5..81edf195feb98a8a58d4c5a12d5ab2df89163ac1 100644 (file)
@@ -166,7 +166,7 @@ std::unique_ptr<StatePropagatorDataGpu> makeStatePropagatorDataGpu(const gmx_pme
     // TODO: Special constructor for PME-only rank / PME-tests is used here. There should be a mechanism to
     //       restrict one from using other constructor here.
     return std::make_unique<StatePropagatorDataGpu>(
-            pme_gpu_get_device_stream(&pme), pme_gpu_get_device_context(&pme),
+            pme_gpu_get_device_stream(&pme), *pme_gpu_get_device_context(&pme),
             GpuApiCallBehavior::Sync, pme_gpu_get_padding_size(&pme), nullptr);
 }
 
index 942141d288cdfca479ac8b121349a40763d5ccc5..8672e450ca175fe46608e8672013980c89808139 100644 (file)
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by
+# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -44,6 +44,7 @@ gmx_add_libgromacs_sources(
         )
 if(GMX_USE_OPENCL)
     gmx_add_libgromacs_sources(
+        device_context_ocl.cpp
         gpu_utils_ocl.cpp
         ocl_compiler.cpp
         ocl_caching.cpp
diff --git a/src/gromacs/gpu_utils/device_context.h b/src/gromacs/gpu_utils/device_context.h
new file mode 100644 (file)
index 0000000..d192b55
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_CONTEXT_H
+#define GMX_GPU_UTILS_DEVICE_CONTEXT_H
+
+/*! \libinternal \file
+ *
+ * \brief Declarations for DeviceContext class.
+ *
+ * Only needed for OpenCL builds. Other platforms will be given a stub class.
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ * \inlibraryapi
+ */
+
+#include "config.h"
+
+#if GMX_GPU == GMX_GPU_OPENCL
+#    include "gromacs/gpu_utils/device_context_ocl.h"
+#else
+#    include "gromacs/utility/classhelpers.h"
+
+struct DeviceInformation;
+
+// Stub for device context
+class DeviceContext
+{
+public:
+    //! Default constructor. In OpenCL leaves context \c nullptr.
+    DeviceContext() {}
+    /*! \brief Second stage of construction. Creates the \c cl_context in OpenCL, does nothing in CUDA.
+     *
+     * \param[in] deviceInfo Platform-specific device information.
+     */
+    void init(const DeviceInformation& /*deviceInfo*/) {}
+    /*! \brief Construct the object and call \c init(...) .
+     *
+     * \param[in] deviceInfo Platform-specific device information.
+     */
+    DeviceContext(const DeviceInformation& deviceInfo) { init(deviceInfo); }
+    //! Destructor
+    ~DeviceContext() = default;
+
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
+};
+#endif // GMX_GPU != GMX_GPU_OPENCL
+
+#endif // GMX_GPU_UTILS_DEVICE_CONTEXT_H
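
With the dispatch above, platform-agnostic code can hold a context member
unconditionally; only OpenCL builds carry a real cl_context. A sketch of the
pattern the CUDA-side classes in this commit follow (class name hypothetical):

    #include "gromacs/gpu_utils/device_context.h"

    class SomeGpuModuleImpl // hypothetical; mirrors GpuHaloExchange::Impl and GpuBonded::Impl
    {
    private:
        //! Dummy GPU context object (default-constructed stub in CUDA builds)
        const DeviceContext deviceContext_;
    };
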
diff --git a/src/gromacs/gpu_utils/device_context_ocl.cpp b/src/gromacs/gpu_utils/device_context_ocl.cpp
new file mode 100644 (file)
index 0000000..e5dfd59
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the DeviceContext for OpenCL
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ */
+#include "gmxpre.h"
+
+#include "device_context_ocl.h"
+
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+/*! \brief Copies of values from cl_driver_diagnostics_intel.h,
+ * which isn't guaranteed to be available. */
+/**@{*/
+#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL 0x1
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL 0x2
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL 0x4
+/**@}*/
+
+DeviceContext::DeviceContext()
+{
+    context_ = nullptr;
+}
+
+void DeviceContext::init(const DeviceInformation& deviceInfo)
+{
+    cl_platform_id                     platformId = deviceInfo.oclPlatformId;
+    cl_device_id                       deviceId   = deviceInfo.oclDeviceId;
+    std::vector<cl_context_properties> contextProperties;
+
+    contextProperties.emplace_back(CL_CONTEXT_PLATFORM);
+    contextProperties.emplace_back(reinterpret_cast<cl_context_properties>(platformId));
+
+    if (getenv("GMX_OCL_SHOW_DIAGNOSTICS"))
+    {
+        contextProperties.emplace_back(CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL);
+        contextProperties.emplace_back(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL
+                                       | CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL);
+    }
+    contextProperties.emplace_back(0);
+
+    cl_int clError;
+    context_ = clCreateContext(contextProperties.data(), 1, &deviceId, nullptr, nullptr, &clError);
+    if (clError != CL_SUCCESS)
+    {
+        GMX_THROW(gmx::InternalError(gmx::formatString(
+                "Failed to create OpenCL context on device %s (OpenCL error ID %d).",
+                deviceInfo.device_name, clError)));
+    }
+}
+
+DeviceContext::DeviceContext(const DeviceInformation& deviceInfo)
+{
+    init(deviceInfo);
+}
+
+DeviceContext::~DeviceContext()
+{
+    cl_int clError;
+
+    if (context_)
+    {
+        clError = clReleaseContext(context_);
+        GMX_RELEASE_ASSERT(
+                clError == CL_SUCCESS,
+                gmx::formatString("Failed to release OpenCL context (OpenCL error ID %d).", clError).c_str());
+        context_ = nullptr;
+    }
+}
+
+cl_context DeviceContext::context() const
+{
+    return context_;
+}
+
+void DeviceContext::setContext(cl_context context)
+{
+    context_ = context;
+}
diff --git a/src/gromacs/gpu_utils/device_context_ocl.h b/src/gromacs/gpu_utils/device_context_ocl.h
new file mode 100644 (file)
index 0000000..58bb755
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H
+#define GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H
+
+/*! \libinternal \file
+ *
+ * \brief Declarations for DeviceContext class.
+ *
+ * Only needed for OpenCL builds. Other platforms will be given a stub class.
+ *
+ * \author Mark Abraham <mark.j.abraham@gmail.com>
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_gpu_utils
+ * \inlibraryapi
+ */
+
+#include "gromacs/gpu_utils/gmxopencl.h"
+#include "gromacs/utility/classhelpers.h"
+
+struct DeviceInformation;
+
+// OpenCL device context class
+class DeviceContext
+{
+public:
+    //! Default constructor. Sets \c context_ to \c nullptr.
+    DeviceContext();
+    /*! \brief Second stage of construction. Creates the \c cl_context.
+     *
+     * \param[in] deviceInfo Platform-specific device information.
+     *
+     * \throws InternalError if context creation failed.
+     */
+    void init(const DeviceInformation& deviceInfo);
+    /*! \brief Construct the object and call \c init(...) .
+     *
+     * \param[in] deviceInfo Platform-specific device information.
+     *
+     * \throws InternalError if context creation failed.
+     */
+    DeviceContext(const DeviceInformation& deviceInfo);
+    //! Destructor
+    ~DeviceContext();
+
+    //! Getter
+    cl_context context() const;
+
+    //! Transition time setter - will be removed
+    void setContext(cl_context context);
+
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(DeviceContext);
+
+private:
+    //! OpenCL context object
+    cl_context context_ = nullptr;
+};
+
+#endif // GMX_GPU_UTILS_DEVICE_CONTEXT_OCL_H
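
The OpenCL class supports two construction paths, both visible in this change
set: the one-stage constructor (used by the tests) and the transitional
setContext() route (used by PmeGpuProgramImpl, which still creates the
cl_context itself for now). A sketch, assuming deviceInfo, contextProperties,
deviceId and clError are set up as in the surrounding code:

    // One-stage: create and own the context; throws InternalError on failure.
    const DeviceContext deviceContext(deviceInfo);
    cl_context ctx = deviceContext.context();

    // Transitional: adopt an externally created context; it is released in ~DeviceContext().
    DeviceContext adopted;
    adopted.setContext(clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError));
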
index 3a1bd3d19534c6bad34fd36e2e488e21191d5e75..59255bfa9332bdfd769f0739094a6452a8bc3688 100644 (file)
@@ -45,6 +45,7 @@
  *  \inlibraryapi
  */
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
 #include "gromacs/gpu_utils/gputraits.cuh"
@@ -60,7 +61,7 @@
  * \param[in]     deviceContext        The buffer's dummy device  context - not managed explicitly in CUDA RT.
  */
 template<typename ValueType>
-void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, DeviceContext /* deviceContext */)
+void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, const DeviceContext& /* deviceContext */)
 {
     GMX_ASSERT(buffer, "needs a buffer pointer");
     cudaError_t stat = cudaMalloc((void**)buffer, numValues * sizeof(ValueType));
index 4c28d49613b2456396c59cc5caa670fdb6fa7587..c0cdfec32963b3a058477439fe9e1a7700349183 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -80,7 +80,7 @@ void reallocateDeviceBuffer(DeviceBuffer<ValueType>* buffer,
                             size_t                   numValues,
                             int*                     currentNumValues,
                             int*                     currentMaxNumValues,
-                            DeviceContext            deviceContext)
+                            const DeviceContext&     deviceContext)
 {
     GMX_ASSERT(buffer, "needs a buffer pointer");
     GMX_ASSERT(currentNumValues, "needs a size pointer");
index 40f1e12941c5f40d010459abb4394d8ccdf22458..ee1adc1cce89e48958309aa55d4d01c0436a9899 100644 (file)
@@ -45,6 +45,7 @@
  *  \inlibraryapi
  */
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/gpu_utils/gpu_utils.h" //only for GpuApiCallBehavior
 #include "gromacs/gpu_utils/gputraits_ocl.h"
  * \param[in]     deviceContext        The buffer's device context-to-be.
  */
 template<typename ValueType>
-void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, DeviceContext deviceContext)
+void allocateDeviceBuffer(DeviceBuffer<ValueType>* buffer, size_t numValues, const DeviceContext& deviceContext)
 {
     GMX_ASSERT(buffer, "needs a buffer pointer");
     void*  hostPtr = nullptr;
     cl_int clError;
-    *buffer = clCreateBuffer(deviceContext, CL_MEM_READ_WRITE, numValues * sizeof(ValueType),
-                             hostPtr, &clError);
+    *buffer = clCreateBuffer(deviceContext.context(), CL_MEM_READ_WRITE,
+                             numValues * sizeof(ValueType), hostPtr, &clError);
     GMX_RELEASE_ASSERT(clError == CL_SUCCESS,
                        gmx::formatString("clCreateBuffer failure (OpenCL error %d: %s)", clError,
                                          ocl_get_error_string(clError).c_str())
index 8a4936dabcf801376e1df06b5e33d268fc136d97..b477cdcb4ce4b5b5bbbf6a20efaeb8df28bf7f25 100644 (file)
@@ -37,7 +37,9 @@
 
 /*! \libinternal \file
  *  \brief Declares the CUDA type traits.
+ *
  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *  \author Artem Zhmurov <zhmurov@gmail.com>
  *
  * \inlibraryapi
  * \ingroup module_gpu_utils
@@ -63,8 +65,6 @@ struct DeviceInformation
 using CommandStream = cudaStream_t;
 //! \brief Single GPU call timing event - meaningless in CUDA
 using CommandEvent = void;
-//! \brief Context used explicitly in OpenCL, does nothing in CUDA
-using DeviceContext = void*;
 
 /*! \internal \brief
  * GPU kernels scheduling description. This is same in OpenCL/CUDA.
index 0229ea443c60c7454a99e50c1751637734dd1964..a36a5cc3bcf5b8ad3db267a42f873c4547712147 100644 (file)
 #define GMX_GPU_UTILS_GPUTRAITS_H
 
 /*! \libinternal \file
- *  \brief Declares the GPU type traits for non-GPU builds
+ *  \brief Declares the GPU type traits for non-GPU builds.
+ *
  *  \author Mark Abraham <mark.j.abraham@gmail.com>
+ *  \author Artem Zhmurov <zhmurov@gmail.com>
  *
  * \inlibraryapi
  * \ingroup module_gpu_utils
@@ -65,8 +67,6 @@ struct DeviceInformation
 using CommandStream = void*;
 //! \brief Single GPU call timing event
 using CommandEvent = void*;
-//! \brief GPU context
-using DeviceContext = void*;
 
 #endif // GMX_GPU
 
index 00d9cba90d95de2b2e52a8b5055aeea07ed2d371..caf837552a453222ede9f11df25f7bc3a9987353 100644 (file)
@@ -37,7 +37,9 @@
 
 /*! \libinternal \file
  *  \brief Declares the OpenCL type traits.
+ *
  *  \author Aleksei Iupinov <a.yupinov@gmail.com>
+ *  \author Artem Zhmurov <zhmurov@gmail.com>
  *
  * \inlibraryapi
  * \ingroup module_gpu_utils
@@ -81,8 +83,6 @@ struct DeviceInformation
 using CommandStream = cl_command_queue;
 //! \brief Single GPU call timing event
 using CommandEvent = cl_event;
-//! \brief Context used explicitly in OpenCL
-using DeviceContext = cl_context;
 
 /*! \internal \brief
  * GPU kernels scheduling description. This is same in OpenCL/CUDA.
index 91b6059d27c5bf1838fdfec1e000015ac6b4a205..230b3ff94e1b0b43f56faf92a5c2cde897272671 100644 (file)
@@ -44,6 +44,7 @@
 
 #include <string>
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gmxopencl.h"
 #include "gromacs/gpu_utils/gputraits_ocl.h"
 #include "gromacs/utility/exceptions.h"
@@ -63,8 +64,10 @@ enum class GpuApiCallBehavior;
  */
 struct gmx_device_runtime_data_t
 {
-    cl_context context; /**< OpenCL context */
-    cl_program program; /**< OpenCL program */
+    //! OpenCL context
+    DeviceContext deviceContext;
+    //! OpenCL program
+    cl_program program;
 };
 
 /*! \brief Launches synchronous or asynchronous device to host memory copy.
index 221959fe79589dab3c7cf292f78e99bc5a34fbd4..e16dd8ebf546bd201fc40ab3fe8ad532a3b2b47a 100644 (file)
@@ -110,15 +110,18 @@ static __global__ void convertRVecToFloat3OnDevice_kernel(DeviceBuffer<float3> g
 
 void convertRVecToFloat3OnDevice(std::vector<gmx::RVec>& h_rVecOutput, const std::vector<gmx::RVec>& h_rVecInput)
 {
+    DeviceInformation   deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
     const int numElements = h_rVecInput.size();
 
     DeviceBuffer<RVec> d_rVecInput;
-    allocateDeviceBuffer(&d_rVecInput, numElements, nullptr);
+    allocateDeviceBuffer(&d_rVecInput, numElements, deviceContext);
     copyToDeviceBuffer(&d_rVecInput, h_rVecInput.data(), 0, numElements, nullptr,
                        GpuApiCallBehavior::Sync, nullptr);
 
     DeviceBuffer<float3> d_float3Output;
-    allocateDeviceBuffer(&d_float3Output, numElements * DIM, nullptr);
+    allocateDeviceBuffer(&d_float3Output, numElements * DIM, deviceContext);
 
     std::vector<float3> h_float3Output(numElements);
 
index bc10e76066178187310a8977ff79d5013a3372b0..ff7092f40cd1719b0a4ffd4cb8f49bd875a417a9 100644 (file)
@@ -68,14 +68,14 @@ GpuBonded::Impl::Impl(const gmx_ffparams_t& ffparams, void* streamPtr, gmx_wallc
     stream_ = *static_cast<CommandStream*>(streamPtr);
     wcycle_ = wcycle;
 
-    allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), nullptr);
+    allocateDeviceBuffer(&d_forceParams_, ffparams.numTypes(), deviceContext_);
     // This could be an async transfer (if the source is pinned), so
     // long as it uses the same stream as the kernels and we are happy
     // to consume additional pinned pages.
     copyToDeviceBuffer(&d_forceParams_, ffparams.iparams.data(), 0, ffparams.numTypes(), stream_,
                        GpuApiCallBehavior::Sync, nullptr);
     vTot_.resize(F_NRE);
-    allocateDeviceBuffer(&d_vTot_, F_NRE, nullptr);
+    allocateDeviceBuffer(&d_vTot_, F_NRE, deviceContext_);
     clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, stream_);
 
     kernelParams_.d_forceParams = d_forceParams_;
@@ -204,7 +204,8 @@ void GpuBonded::Impl::updateInteractionListsAndDeviceBuffers(ArrayRef<const int>
         {
             t_ilist& d_iList = d_iLists_[fType];
 
-            reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc, nullptr);
+            reallocateDeviceBuffer(&d_iList.iatoms, iList.size(), &d_iList.nr, &d_iList.nalloc,
+                                   deviceContext_);
 
             copyToDeviceBuffer(&d_iList.iatoms, iList.iatoms.data(), 0, iList.size(), stream_,
                                GpuApiCallBehavior::Async, nullptr);
index a785b16dbba1584df4276b987bdcbc418a87cd16..0532b40315a0b893c6ec4457d5a4714690f7cac6 100644 (file)
@@ -48,6 +48,7 @@
 #ifndef GMX_LISTED_FORCES_GPUBONDED_IMPL_H
 #define GMX_LISTED_FORCES_GPUBONDED_IMPL_H
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/listed_forces/gpubonded.h"
@@ -179,6 +180,8 @@ private:
     //! \brief Device-side total virial
     float* d_vTot_ = nullptr;
 
+    //! Dummy GPU context object
+    const DeviceContext deviceContext_;
     //! \brief Bonded GPU stream, not owned by this module
     CommandStream stream_;
 
index 61bc231e3660262406ed3e8ef49c79595a1fa040..b77162c1af47f363499338346efc02826f8057af 100644 (file)
@@ -316,7 +316,9 @@ void LeapFrogGpu::integrate(const float3*                     d_x,
     return;
 }
 
-LeapFrogGpu::LeapFrogGpu(CommandStream commandStream) : commandStream_(commandStream)
+LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream) :
+    deviceContext_(deviceContext),
+    commandStream_(commandStream)
 {
     numAtoms_ = 0;
 
@@ -342,7 +344,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u
     numTempScaleValues_ = numTempScaleValues;
 
     reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
-                           &numInverseMassesAlloc_, nullptr);
+                           &numInverseMassesAlloc_, deviceContext_);
     copyToDeviceBuffer(&d_inverseMasses_, (float*)md.invmass, 0, numAtoms_, commandStream_,
                        GpuApiCallBehavior::Sync, nullptr);
 
@@ -350,7 +352,7 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u
     if (numTempScaleValues > 1)
     {
         reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_,
-                               &numTempScaleGroupsAlloc_, nullptr);
+                               &numTempScaleGroupsAlloc_, deviceContext_);
         copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, commandStream_,
                            GpuApiCallBehavior::Sync, nullptr);
     }
@@ -359,7 +361,8 @@ void LeapFrogGpu::set(const t_mdatoms& md, const int numTempScaleValues, const u
     if (numTempScaleValues_ > 0)
     {
         h_lambdas_.resize(numTempScaleValues);
-        reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, nullptr);
+        reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_,
+                               deviceContext_);
     }
 }
 
index ba6d1be3f209437744959e4fbe18de8ddc4bb649..98703c05b94121b8dcaaff96910d6e8b0451a909 100644 (file)
@@ -44,6 +44,7 @@
 #ifndef GMX_MDLIB_LEAPFROG_GPU_CUH
 #define GMX_MDLIB_LEAPFROG_GPU_CUH
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/mdtypes/group.h"
@@ -62,9 +63,10 @@ class LeapFrogGpu
 public:
     /*! \brief Constructor.
      *
+     * \param[in] deviceContext  Device context (dummy in CUDA).
      * \param[in] commandStream  Device command stream to use.
      */
-    LeapFrogGpu(CommandStream commandStream);
+    LeapFrogGpu(const DeviceContext& deviceContext, CommandStream commandStream);
     ~LeapFrogGpu();
 
     /*! \brief Integrate
@@ -110,6 +112,8 @@ public:
     class Impl;
 
 private:
+    //! Dummy GPU context object
+    const DeviceContext& deviceContext_;
     //! GPU stream
     CommandStream commandStream_;
     //! GPU kernel launch config
index edf3e9c58a27d799b6395197ed92af13c59101c0..03c1bd1d15ed4f5dfcab5d91ae2c3e3b5ee11239 100644 (file)
@@ -507,7 +507,11 @@ void LincsGpu::apply(const float3* d_x,
     return;
 }
 
-LincsGpu::LincsGpu(int numIterations, int expansionOrder, CommandStream commandStream) :
+LincsGpu::LincsGpu(int                  numIterations,
+                   int                  expansionOrder,
+                   const DeviceContext& deviceContext,
+                   CommandStream        commandStream) :
+    deviceContext_(deviceContext),
     commandStream_(commandStream)
 {
     kernelParams_.numIterations  = numIterations;
@@ -519,7 +523,7 @@ LincsGpu::LincsGpu(int numIterations, int expansionOrder, CommandStream commandS
             c_threadsPerBlock > 0 && ((c_threadsPerBlock & (c_threadsPerBlock - 1)) == 0),
             "Number of threads per block should be a power of two in order for reduction to work.");
 
-    allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, nullptr);
+    allocateDeviceBuffer(&kernelParams_.d_virialScaled, 6, deviceContext_);
     h_virialScaled_.resize(6);
 
     // The data arrays should be expanded/reallocated on first call of set() function.
@@ -911,18 +915,19 @@ void LincsGpu::set(const InteractionDefinitions& idef, const t_mdatoms& md)
 
         numConstraintsThreadsAlloc_ = kernelParams_.numConstraintsThreads;
 
-        allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads, nullptr);
+        allocateDeviceBuffer(&kernelParams_.d_constraints, kernelParams_.numConstraintsThreads,
+                             deviceContext_);
         allocateDeviceBuffer(&kernelParams_.d_constraintsTargetLengths,
-                             kernelParams_.numConstraintsThreads, nullptr);
+                             kernelParams_.numConstraintsThreads, deviceContext_);
 
         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsCounts,
-                             kernelParams_.numConstraintsThreads, nullptr);
+                             kernelParams_.numConstraintsThreads, deviceContext_);
         allocateDeviceBuffer(&kernelParams_.d_coupledConstraintsIndices,
-                             maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
+                             maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_);
         allocateDeviceBuffer(&kernelParams_.d_massFactors,
-                             maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
+                             maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_);
         allocateDeviceBuffer(&kernelParams_.d_matrixA,
-                             maxCoupledConstraints * kernelParams_.numConstraintsThreads, nullptr);
+                             maxCoupledConstraints * kernelParams_.numConstraintsThreads, deviceContext_);
     }
 
     // (Re)allocate the memory, if the number of atoms has increased.
@@ -933,7 +938,7 @@ void LincsGpu::set(const InteractionDefinitions& idef, const t_mdatoms& md)
             freeDeviceBuffer(&kernelParams_.d_inverseMasses);
         }
         numAtomsAlloc_ = numAtoms;
-        allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, nullptr);
+        allocateDeviceBuffer(&kernelParams_.d_inverseMasses, numAtoms, deviceContext_);
     }
 
     // Copy data to GPU.
index 0fbebcf67fc762181958e20e04c08554eed5e81f..77423dc3231fee737151501664f2ef9f40d7dfe9 100644 (file)
@@ -44,6 +44,7 @@
 #ifndef GMX_MDLIB_LINCS_GPU_CUH
 #define GMX_MDLIB_LINCS_GPU_CUH
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/mdlib/constr.h"
 #include "gromacs/mdtypes/mdatom.h"
@@ -103,9 +104,10 @@ public:
      *
      * \param[in] numIterations    Number of iteration for the correction of the projection.
      * \param[in] expansionOrder   Order of the matrix inversion algorithm.
+     * \param[in] deviceContext    Device context (dummy in CUDA).
      * \param[in] commandStream    Device command stream.
      */
-    LincsGpu(int numIterations, int expansionOrder, CommandStream commandStream);
+    LincsGpu(int numIterations, int expansionOrder, const DeviceContext& deviceContext, CommandStream commandStream);
     /*! \brief Destructor.*/
     ~LincsGpu();
 
@@ -167,6 +169,8 @@ public:
     static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop);
 
 private:
+    //! Dummy GPU context object
+    const DeviceContext& deviceContext_;
     //! GPU stream
     CommandStream commandStream_;
 
index d1e8f508d504e7789d917a56a5e3a71c2c69222a..20933baf965604f7d72e04f5dd02a12ae3bbeeca 100644 (file)
@@ -485,7 +485,8 @@ void SettleGpu::apply(const float3* d_x,
     return;
 }
 
-SettleGpu::SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream) :
+SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream) :
+    deviceContext_(deviceContext),
     commandStream_(commandStream)
 {
     static_assert(sizeof(real) == sizeof(float),
@@ -586,7 +587,7 @@ SettleGpu::SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream) :
 
     initSettleParameters(&settleParameters_, mO, mH, dOH, dHH);
 
-    allocateDeviceBuffer(&d_virialScaled_, 6, nullptr);
+    allocateDeviceBuffer(&d_virialScaled_, 6, deviceContext_);
     h_virialScaled_.resize(6);
 }
 
@@ -611,7 +612,7 @@ void SettleGpu::set(const InteractionDefinitions& idef, const t_mdatoms gmx_unus
     ArrayRef<const int>    iatoms    = il_settle.iatoms;
     numSettles_                      = il_settle.size() / nral1;
 
-    reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, nullptr);
+    reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, deviceContext_);
     h_atomIds_.resize(numSettles_);
     for (int i = 0; i < numSettles_; i++)
     {
index 3816579d638a0ac5cc956ea54938726b5941501c..f07af017e33fdaba6b53382465b6da76d340a7a3 100644 (file)
@@ -45,6 +45,7 @@
 
 #include "gmxpre.h"
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/math/functions.h"
 #include "gromacs/math/invertmatrix.h"
@@ -200,9 +201,10 @@ public:
      * \param[in] mtop           Topology of the system to get the masses for O and H atoms and
      *                           target O-H and H-H distances. These values are also checked for
      *                           consistency.
+     * \param[in] deviceContext  Device context (dummy in CUDA).
      * \param[in] commandStream  Device stream to use.
      */
-    SettleGpu(const gmx_mtop_t& mtop, CommandStream commandStream);
+    SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, CommandStream commandStream);
 
     ~SettleGpu();
 
@@ -250,6 +252,8 @@ public:
     void set(const InteractionDefinitions& idef, const t_mdatoms& md);
 
 private:
+    //! Dummy GPU context object
+    const DeviceContext& deviceContext_;
     //! GPU stream
     CommandStream commandStream_;
 
index 322c3beec9370afae9454c8c4983dd8b24a48e0e..5c0a007ee4461f96ca4b9298c29391be3041d38f 100644 (file)
@@ -70,8 +70,11 @@ namespace test
  */
 void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
 {
-    auto lincsGpu =
-            std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder, nullptr);
+    DeviceInformation   deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
+    auto lincsGpu = std::make_unique<LincsGpu>(testData->ir_.nLincsIter, testData->ir_.nProjOrder,
+                                               deviceContext, nullptr);
 
     bool    updateVelocities = true;
     int     numAtoms         = testData->numAtoms_;
@@ -81,9 +84,9 @@ void applyLincsGpu(ConstraintsTestData* testData, t_pbc pbc)
     PbcAiuc pbcAiuc;
     setPbcAiuc(pbc.ndim_ePBC, pbc.box, &pbcAiuc);
 
-    allocateDeviceBuffer(&d_x, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_v, numAtoms, nullptr);
+    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
 
     copyToDeviceBuffer(&d_x, (float3*)(testData->x_.data()), 0, numAtoms, nullptr,
                        GpuApiCallBehavior::Sync, nullptr);
index 1c3afdd1b00806340efabd4fce0d9a5a80b2e721..b794149ddb8a34fb51ebaf103be82f0428b32ffa 100644 (file)
@@ -66,6 +66,9 @@ namespace test
 
 void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
 {
+    DeviceInformation   deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
     int numAtoms = testData->numAtoms_;
 
     float3* h_x  = reinterpret_cast<float3*>(testData->x_.data());
@@ -75,17 +78,17 @@ void integrateLeapFrogGpu(LeapFrogTestData* testData, int numSteps)
 
     float3 *d_x, *d_xp, *d_v, *d_f;
 
-    allocateDeviceBuffer(&d_x, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_v, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_f, numAtoms, nullptr);
+    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_f, numAtoms, deviceContext);
 
     copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_xp, h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_v, h_v, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_f, h_f, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
 
-    auto integrator = std::make_unique<LeapFrogGpu>(nullptr);
+    auto integrator = std::make_unique<LeapFrogGpu>(deviceContext, nullptr);
 
     integrator->set(testData->mdAtoms_, testData->numTCoupleGroups_, testData->mdAtoms_.cTC);
 
index a14b47e81952dcf7aff9a3e0b97a21917a12dc05..6ebc6688da98aa422cdfb67e9686bec66c7ca788 100644 (file)
@@ -86,7 +86,10 @@ void applySettleGpu(SettleTestData*  testData,
     // TODO: Here we should check that at least 1 suitable GPU is available
     GMX_RELEASE_ASSERT(canPerformGpuDetection(), "Can't detect CUDA-capable GPUs.");
 
-    auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, nullptr);
+    DeviceInformation   deviceInfo;
+    const DeviceContext deviceContext(deviceInfo);
+
+    auto settleGpu = std::make_unique<SettleGpu>(testData->mtop_, deviceContext, nullptr);
 
     settleGpu->set(*testData->idef_, testData->mdatoms_);
     PbcAiuc pbcAiuc;
@@ -100,9 +103,9 @@ void applySettleGpu(SettleTestData*  testData,
     float3* h_xp = (float3*)(as_rvec_array(testData->xPrime_.data()));
     float3* h_v  = (float3*)(as_rvec_array(testData->v_.data()));
 
-    allocateDeviceBuffer(&d_x, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_xp, numAtoms, nullptr);
-    allocateDeviceBuffer(&d_v, numAtoms, nullptr);
+    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_xp, numAtoms, deviceContext);
+    allocateDeviceBuffer(&d_v, numAtoms, deviceContext);
 
     copyToDeviceBuffer(&d_x, (float3*)h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
     copyToDeviceBuffer(&d_xp, (float3*)h_xp, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);
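
All three test runners set up the GPU in the same way: a default-constructed DeviceInformation is enough to build the (dummy, in CUDA) DeviceContext, which is then handed both to the class under test and to the buffer helpers. A condensed sketch of that pattern, using only calls that appear in the hunks above; h_x and numAtoms stand in for the actual test data:

    DeviceInformation   deviceInfo;                 // default-constructed is sufficient here
    const DeviceContext deviceContext(deviceInfo);  // platform-agnostic context object

    float3* d_x = nullptr;
    allocateDeviceBuffer(&d_x, numAtoms, deviceContext);
    copyToDeviceBuffer(&d_x, h_x, 0, numAtoms, nullptr, GpuApiCallBehavior::Sync, nullptr);

    // ... construct LincsGpu / LeapFrogGpu / SettleGpu with deviceContext and exercise it ...

    freeDeviceBuffer(&d_x);
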
index c77e1924ed4f20308796d387af99fd772dea2e71..a8e5a94cc6d87bbf649d84743379744cf47ea860 100644 (file)
@@ -175,9 +175,9 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
                              : commandStream_ = nullptr;
 
 
-    integrator_ = std::make_unique<LeapFrogGpu>(commandStream_);
-    lincsGpu_   = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, commandStream_);
-    settleGpu_  = std::make_unique<SettleGpu>(mtop, commandStream_);
+    integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, commandStream_);
+    lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, commandStream_);
+    settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, commandStream_);
 
     coordinateScalingKernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
     coordinateScalingKernelLaunchConfig_.blockSize[1]     = 1;
@@ -205,10 +205,10 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer<RVec>            d_x,
 
     numAtoms_ = md.nr;
 
-    reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, nullptr);
+    reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, deviceContext_);
 
     reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
-                           &numInverseMassesAlloc_, nullptr);
+                           &numInverseMassesAlloc_, deviceContext_);
 
     // Integrator should also update something, but it does not even have a method yet
     integrator_->set(md, numTempScaleValues, md.cTC);
index b835c7cf02bf0196dc21eee493f7e1d8674a5431..75b6814de0066627d4d1ed9e1d7d24ce95f2f9fc 100644 (file)
@@ -163,6 +163,8 @@ public:
     static bool isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop);
 
 private:
+    //! Dummy GPU context object
+    const DeviceContext deviceContext_;
     //! GPU stream
     CommandStream commandStream_ = nullptr;
     //! GPU kernel launch config
index 9b31070072550f30e2b44614b4252e1b7f47de93..b233b0737c9540c019f12b869fa4d906a2333ca4 100644 (file)
@@ -1566,8 +1566,8 @@ int Mdrunner::mdrunner()
                     fr->nbv->gpu_nbv != nullptr
                             ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
                             : nullptr;
-            const void*        deviceContext = pme_gpu_get_device_context(fr->pmedata);
-            const int          paddingSize   = pme_gpu_get_padding_size(fr->pmedata);
+            const DeviceContext& deviceContext = *pme_gpu_get_device_context(fr->pmedata);
+            const int            paddingSize   = pme_gpu_get_padding_size(fr->pmedata);
             GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
                                                       ? GpuApiCallBehavior::Async
                                                       : GpuApiCallBehavior::Sync;
index d75cd78ea1e2b742d559580b7d92ccfe5144a019..034e7eb604d1e0c6348b8dd8a45c015b30527315 100644 (file)
@@ -59,6 +59,7 @@
 
 #include "locality.h"
 
+class DeviceContext;
 class GpuEventSynchronizer;
 struct gmx_wallcycle;
 
@@ -100,8 +101,6 @@ public:
      *
      * \todo Make a \p CommandStream visible in the CPU parts of the code so we
      *       will not have to pass a void*.
-     * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
-     *       will not have to pass a void*.
      *
      *  \param[in] pmeStream       Device PME stream, nullptr allowed.
      *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
@@ -111,13 +110,13 @@ public:
      *  \param[in] paddingSize     Padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
-    StatePropagatorDataGpu(const void*        pmeStream,
-                           const void*        localStream,
-                           const void*        nonLocalStream,
-                           const void*        deviceContext,
-                           GpuApiCallBehavior transferKind,
-                           int                paddingSize,
-                           gmx_wallcycle*     wcycle);
+    StatePropagatorDataGpu(const void*          pmeStream,
+                           const void*          localStream,
+                           const void*          nonLocalStream,
+                           const DeviceContext& deviceContext,
+                           GpuApiCallBehavior   transferKind,
+                           int                  paddingSize,
+                           gmx_wallcycle*       wcycle);
 
     /*! \brief Constructor to use in PME-only rank and in tests.
      *
@@ -135,11 +134,11 @@ public:
      *  \param[in] paddingSize     Padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
-    StatePropagatorDataGpu(const void*        pmeStream,
-                           const void*        deviceContext,
-                           GpuApiCallBehavior transferKind,
-                           int                paddingSize,
-                           gmx_wallcycle*     wcycle);
+    StatePropagatorDataGpu(const void*          pmeStream,
+                           const DeviceContext& deviceContext,
+                           GpuApiCallBehavior   transferKind,
+                           int                  paddingSize,
+                           gmx_wallcycle*       wcycle);
 
     //! Move constructor
     StatePropagatorDataGpu(StatePropagatorDataGpu&& other) noexcept;
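
Callers now pass the context as a typed reference instead of a void*. A sketch of how the PME-only/test constructor can be invoked after this change, assuming (as the runner.cpp hunk above suggests) that pme_gpu_get_device_context() returns a const DeviceContext*; pmedata, pmeStream, paddingSize and wcycle are placeholders:

    const DeviceContext& deviceContext = *pme_gpu_get_device_context(pmedata);
    auto stateGpu = std::make_unique<StatePropagatorDataGpu>(
            pmeStream, deviceContext, GpuApiCallBehavior::Sync, paddingSize, wcycle);
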
index 15f054eafaa59354782743d04aab68e2b9d9d84a..1029dd220fa12c72480c338d6cb28d14864d2f54 100644 (file)
@@ -57,7 +57,7 @@ class StatePropagatorDataGpu::Impl
 StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream       */,
                                                const void* /* localStream     */,
                                                const void* /* nonLocalStream  */,
-                                               const void* /* deviceContext   */,
+                                               const DeviceContext& /* deviceContext   */,
                                                GpuApiCallBehavior /* transferKind    */,
                                                int /* paddingSize     */,
                                                gmx_wallcycle* /*   wcycle */) :
@@ -66,7 +66,7 @@ StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream       */
 }
 
 StatePropagatorDataGpu::StatePropagatorDataGpu(const void* /* pmeStream       */,
-                                               const void* /* deviceContext   */,
+                                               const DeviceContext& /* deviceContext   */,
                                                GpuApiCallBehavior /* transferKind    */,
                                                int /* paddingSize     */,
                                                gmx_wallcycle* /*   wcycle */) :
index a138bcbb624f6d48f0dba52f1223a9427b4113ad..679bf2544a81e082f124e568272da78cdb3e9cd7 100644 (file)
@@ -101,8 +101,6 @@ public:
      *
      * \todo Make a \p CommandStream visible in the CPU parts of the code so we
      *       will not have to pass a void*.
-     * \todo Make a \p DeviceContext object visible in CPU parts of the code so we
-     *       will not have to pass a void*.
      *
      *  \param[in] pmeStream       Device PME stream, nullptr allowed.
      *  \param[in] localStream     Device NBNXM local stream, nullptr allowed.
@@ -112,13 +110,13 @@ public:
      *  \param[in] paddingSize     Padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
-    Impl(const void*        pmeStream,
-         const void*        localStream,
-         const void*        nonLocalStream,
-         const void*        deviceContext,
-         GpuApiCallBehavior transferKind,
-         int                paddingSize,
-         gmx_wallcycle*     wcycle);
+    Impl(const void*          pmeStream,
+         const void*          localStream,
+         const void*          nonLocalStream,
+         const DeviceContext& deviceContext,
+         GpuApiCallBehavior   transferKind,
+         int                  paddingSize,
+         gmx_wallcycle*       wcycle);
 
     /*! \brief Constructor to use in PME-only rank and in tests.
      *
@@ -136,11 +134,11 @@ public:
      *  \param[in] paddingSize     Padding size for coordinates buffer.
      *  \param[in] wcycle          Wall cycle counter data.
      */
-    Impl(const void*        pmeStream,
-         const void*        deviceContext,
-         GpuApiCallBehavior transferKind,
-         int                paddingSize,
-         gmx_wallcycle*     wcycle);
+    Impl(const void*          pmeStream,
+         const DeviceContext& deviceContext,
+         GpuApiCallBehavior   transferKind,
+         int                  paddingSize,
+         gmx_wallcycle*       wcycle);
 
     ~Impl();
 
@@ -380,10 +378,8 @@ private:
     //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
     EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
 
-    /*! \brief GPU context (for OpenCL builds)
-     * \todo Make a Context class usable in CPU code
-     */
-    DeviceContext deviceContext_ = nullptr;
+    //! GPU context (for OpenCL builds)
+    const DeviceContext& deviceContext_;
     //! Default GPU calls behavior
     GpuApiCallBehavior transferKind_ = GpuApiCallBehavior::Async;
     //! Padding size for the coordinates buffer
index 4b385a5a7b56ca5b82a22c57bb648a127b6b731d..d88f469711dcc5343560df94729a8455e7363d23 100644 (file)
@@ -50,6 +50,7 @@
 #        include "gromacs/gpu_utils/cudautils.cuh"
 #    endif
 #    include "gromacs/gpu_utils/devicebuffer.h"
+#    include "gromacs/gpu_utils/gputraits.h"
 #    if GMX_GPU == GMX_GPU_OPENCL
 #        include "gromacs/gpu_utils/oclutils.h"
 #    endif
 namespace gmx
 {
 
-StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
-                                   const void*        localStream,
-                                   const void*        nonLocalStream,
-                                   const void*        deviceContext,
-                                   GpuApiCallBehavior transferKind,
-                                   int                paddingSize,
-                                   gmx_wallcycle*     wcycle) :
+StatePropagatorDataGpu::Impl::Impl(const void*          pmeStream,
+                                   const void*          localStream,
+                                   const void*          nonLocalStream,
+                                   const DeviceContext& deviceContext,
+                                   GpuApiCallBehavior   transferKind,
+                                   int                  paddingSize,
+                                   gmx_wallcycle*       wcycle) :
+    deviceContext_(deviceContext),
     transferKind_(transferKind),
     paddingSize_(paddingSize),
     wcycle_(wcycle)
@@ -81,13 +83,11 @@ StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
     // TODO: Refactor when the StreamManager is introduced.
     if (GMX_GPU == GMX_GPU_OPENCL)
     {
-        GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
         GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set in OpenCL builds.");
 
         // The update stream is set to the PME stream in OpenCL, since the PME stream is the only stream created in the PME context.
-        pmeStream_     = *static_cast<const CommandStream*>(pmeStream);
-        updateStream_  = *static_cast<const CommandStream*>(pmeStream);
-        deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
+        pmeStream_    = *static_cast<const CommandStream*>(pmeStream);
+        updateStream_ = *static_cast<const CommandStream*>(pmeStream);
         GMX_UNUSED_VALUE(localStream);
         GMX_UNUSED_VALUE(nonLocalStream);
     }
@@ -113,7 +113,6 @@ StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
         stat = cudaStreamCreate(&updateStream_);
         CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu");
 #    endif
-        GMX_UNUSED_VALUE(deviceContext);
     }
 
     // Map the atom locality to the stream that will be used for coordinates,
@@ -132,11 +131,12 @@ StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
     fCopyStreams_[AtomLocality::All]      = updateStream_;
 }
 
-StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
-                                   const void*        deviceContext,
-                                   GpuApiCallBehavior transferKind,
-                                   int                paddingSize,
-                                   gmx_wallcycle*     wcycle) :
+StatePropagatorDataGpu::Impl::Impl(const void*          pmeStream,
+                                   const DeviceContext& deviceContext,
+                                   GpuApiCallBehavior   transferKind,
+                                   int                  paddingSize,
+                                   gmx_wallcycle*       wcycle) :
+    deviceContext_(deviceContext),
     transferKind_(transferKind),
     paddingSize_(paddingSize),
     wcycle_(wcycle)
@@ -144,12 +144,6 @@ StatePropagatorDataGpu::Impl::Impl(const void*        pmeStream,
     static_assert(GMX_GPU != GMX_GPU_NONE,
                   "This object should only be constructed on the GPU code-paths.");
 
-    if (GMX_GPU == GMX_GPU_OPENCL)
-    {
-        GMX_ASSERT(deviceContext != nullptr, "GPU context should be set in OpenCL builds.");
-        deviceContext_ = *static_cast<const DeviceContext*>(deviceContext);
-    }
-
     GMX_ASSERT(pmeStream != nullptr, "GPU PME stream should be set.");
     pmeStream_ = *static_cast<const CommandStream*>(pmeStream);
 
@@ -551,22 +545,22 @@ int StatePropagatorDataGpu::Impl::numAtomsAll()
 }
 
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void*        pmeStream,
-                                               const void*        localStream,
-                                               const void*        nonLocalStream,
-                                               const void*        deviceContext,
-                                               GpuApiCallBehavior transferKind,
-                                               int                paddingSize,
-                                               gmx_wallcycle*     wcycle) :
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void*          pmeStream,
+                                               const void*          localStream,
+                                               const void*          nonLocalStream,
+                                               const DeviceContext& deviceContext,
+                                               GpuApiCallBehavior   transferKind,
+                                               int                  paddingSize,
+                                               gmx_wallcycle*       wcycle) :
     impl_(new Impl(pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle))
 {
 }
 
-StatePropagatorDataGpu::StatePropagatorDataGpu(const void*        pmeStream,
-                                               const void*        deviceContext,
-                                               GpuApiCallBehavior transferKind,
-                                               int                paddingSize,
-                                               gmx_wallcycle*     wcycle) :
+StatePropagatorDataGpu::StatePropagatorDataGpu(const void*          pmeStream,
+                                               const DeviceContext& deviceContext,
+                                               GpuApiCallBehavior   transferKind,
+                                               int                  paddingSize,
+                                               gmx_wallcycle*       wcycle) :
     impl_(new Impl(pmeStream, deviceContext, transferKind, paddingSize, wcycle))
 {
 }
index fa290e6473e74027bc50fb90cb5750c644491b5c..7467f95b69596f4f0cb63e19195ad510fb43a6a8 100644 (file)
@@ -51,6 +51,7 @@
 
 // TODO Remove this comment when the above order issue is resolved
 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/pmalloc_cuda.h"
@@ -537,21 +538,21 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
         iTimers.didPairlistH2D = true;
     }
 
-    DeviceContext context = nullptr;
-
-    reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, context);
+    reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
+                           DeviceContext());
     copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
-    reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+    reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
+                           DeviceContext());
     copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
-                           &d_plist->nimask, &d_plist->imask_nalloc, context);
+                           &d_plist->nimask, &d_plist->imask_nalloc, DeviceContext());
 
     reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
-                           &d_plist->excl_nalloc, context);
+                           &d_plist->excl_nalloc, DeviceContext());
     copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
@@ -857,9 +858,9 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
     const int    maxNumColumns = gridSet.numColumnsMax();
 
     reallocateDeviceBuffer(&gpu_nbv->cxy_na, maxNumColumns * gridSet.grids().size(),
-                           &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, nullptr);
+                           &gpu_nbv->ncxy_na, &gpu_nbv->ncxy_na_alloc, DeviceContext());
     reallocateDeviceBuffer(&gpu_nbv->cxy_ind, maxNumColumns * gridSet.grids().size(),
-                           &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, nullptr);
+                           &gpu_nbv->ncxy_ind, &gpu_nbv->ncxy_ind_alloc, DeviceContext());
 
     for (unsigned int g = 0; g < gridSet.grids().size(); g++)
     {
@@ -873,7 +874,7 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
         const int* cxy_ind         = grid.cxy_ind().data();
 
         reallocateDeviceBuffer(&gpu_nbv->atomIndices, atomIndicesSize, &gpu_nbv->atomIndicesSize,
-                               &gpu_nbv->atomIndicesSize_alloc, nullptr);
+                               &gpu_nbv->atomIndicesSize_alloc, DeviceContext());
 
         if (atomIndicesSize > 0)
         {
@@ -948,7 +949,8 @@ void nbnxn_gpu_init_add_nbat_f_to_f(const int*                  cell,
 
     if (natoms_total > 0)
     {
-        reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
+        reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc,
+                               DeviceContext());
         copyToDeviceBuffer(&gpu_nbv->cell, cell, 0, natoms_total, stream, GpuApiCallBehavior::Async, nullptr);
     }
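
There is no stored context object in this CUDA-only path, so a default-constructed DeviceContext() temporary is passed to the reallocation helpers. That only works if the non-OpenCL flavour of the class is an empty, cheaply constructible dummy; a sketch of what gpu_utils/device_context.h presumably declares for such builds (the committed header may differ):

    // Non-OpenCL builds: the context is a no-op placeholder.
    class DeviceContext
    {
    public:
        DeviceContext() = default;
        //! Accepted for interface symmetry with the OpenCL version; unused in CUDA.
        explicit DeviceContext(const DeviceInformation& /* deviceInfo */) {}
    };
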
 
index 05a270b7f010b78b747a2cd8313943bca470a263..013bd093a01e3aff9f67f7706e199bedcf4aab2c 100644 (file)
@@ -70,6 +70,7 @@
 
 #include "thread_mpi/atomic.h"
 
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gputraits_ocl.h"
 #include "gromacs/gpu_utils/oclutils.h"
 #include "gromacs/hardware/hw_info.h"
index 59ee706c4f99c39893db7d0fadba75f32f3e933d..50569242245e851e4f21c03bcb272a01fea7969a 100644 (file)
@@ -134,13 +134,14 @@ static void init_ewald_coulomb_force_table(const EwaldCorrectionTables&     tabl
        array_format.image_channel_data_type = CL_FLOAT;
        array_format.image_channel_order     = CL_R;
 
-       coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-       &array_format, tabsize, 1, 0, ftmp, &cl_error);
+       coul_tab = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_WRITE |
+       CL_MEM_COPY_HOST_PTR, &array_format, tabsize, 1, 0, ftmp, &cl_error);
      */
 
-    coul_tab = clCreateBuffer(
-            runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
-            tables.tableF.size() * sizeof(cl_float), const_cast<real*>(tables.tableF.data()), &cl_error);
+    coul_tab = clCreateBuffer(runData->deviceContext.context(),
+                              CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+                              tables.tableF.size() * sizeof(cl_float),
+                              const_cast<real*>(tables.tableF.data()), &cl_error);
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
@@ -158,23 +159,24 @@ static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, gmx_device_runtim
 
     ad->ntypes = ntypes;
 
-    ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
-                                   SHIFTS * sizeof(nbnxn_atomdata_t::shift_vec[0]), nullptr, &cl_error);
+    ad->shift_vec =
+            clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+                           SHIFTS * sizeof(nbnxn_atomdata_t::shift_vec[0]), nullptr, &cl_error);
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
     ad->bShiftVecUploaded = CL_FALSE;
 
-    ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+    ad->fshift = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
                                 SHIFTS * sizeof(nb_staging_t::fshift[0]), nullptr, &cl_error);
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
-    ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+    ad->e_lj = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
                               sizeof(float), nullptr, &cl_error);
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
-    ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+    ad->e_el = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
                               sizeof(float), nullptr, &cl_error);
     GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                        ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
@@ -330,11 +332,11 @@ static void init_nbparam(cl_nbparam_t*                    nbp,
            array_format.image_channel_data_type = CL_FLOAT;
            array_format.image_channel_order     = CL_R;
 
-           nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
-            &array_format, 1, 1, 0, nullptr, &cl_error);
+           nbp->coulomb_tab_climg2d = clCreateImage2D(runData->deviceContext.context(),
+           CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);
          */
 
-        nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY,
+        nbp->coulomb_tab_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY,
                                                   sizeof(cl_float), nullptr, &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
@@ -352,12 +354,13 @@ static void init_nbparam(cl_nbparam_t*                    nbp,
            array_format.image_channel_data_type = CL_FLOAT;
            array_format.image_channel_order     = CL_R;
 
-           nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY |
+           nbp->nbfp_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_ONLY |
            CL_MEM_COPY_HOST_PTR, &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
          */
 
         nbp->nbfp_climg2d = clCreateBuffer(
-                runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+                runData->deviceContext.context(),
+                CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
                 nnbfp * sizeof(cl_float), const_cast<float*>(nbatParams.nbfp.data()), &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
@@ -366,12 +369,13 @@ static void init_nbparam(cl_nbparam_t*                    nbp,
         {
             /* Switched from using textures to using buffers */
             // TODO: decide which alternative is most efficient - textures or buffers.
-            /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE |
+            /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(), CL_MEM_READ_WRITE |
                CL_MEM_COPY_HOST_PTR, &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
-            nbp->nbfp_comb_climg2d = clCreateBuffer(
-                    runData->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
-                    nnbfp_comb * sizeof(cl_float), const_cast<float*>(nbatParams.nbfp_comb.data()),
-                    &cl_error);
+            nbp->nbfp_comb_climg2d =
+                    clCreateBuffer(runData->deviceContext.context(),
+                                   CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+                                   nnbfp_comb * sizeof(cl_float),
+                                   const_cast<float*>(nbatParams.nbfp_comb.data()), &cl_error);
             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
         }
@@ -382,9 +386,9 @@ static void init_nbparam(cl_nbparam_t*                    nbp,
             // don't accept nullptr values for image2D parameters.
             /* Switched from using textures to using buffers */
             // TODO: decide which alternative is most efficient - textures or buffers.
-            /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
-                &array_format, 1, 1, 0, nullptr, &cl_error);*/
-            nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY,
+            /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->deviceContext.context(),
+               CL_MEM_READ_WRITE, &array_format, 1, 1, 0, nullptr, &cl_error);*/
+            nbp->nbfp_comb_climg2d = clCreateBuffer(runData->deviceContext.context(), CL_MEM_READ_ONLY,
                                                     sizeof(cl_float), nullptr, &cl_error);
             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                                ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
@@ -515,7 +519,7 @@ static void nbnxn_gpu_create_context(gmx_device_runtime_data_t* runtimeData,
                   rank, deviceInfo->device_name, cl_error, ocl_get_error_string(cl_error).c_str());
     }
 
-    runtimeData->context = context;
+    runtimeData->deviceContext.setContext(context);
 }
 
 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
@@ -636,8 +640,8 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
     snew(nb->timings, 1);
 
     /* set device info, just point it to the right GPU among the detected ones */
-    nb->deviceInfo = deviceInfo;
-    snew(nb->dev_rundata, 1);
+    nb->deviceInfo  = deviceInfo;
+    nb->dev_rundata = new gmx_device_runtime_data_t();
 
     /* init nbst */
     pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
@@ -662,8 +666,9 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
     nbnxn_gpu_create_context(nb->dev_rundata, nb->deviceInfo, rank);
 
     /* local/non-local GPU streams */
-    nb->stream[InteractionLocality::Local] = clCreateCommandQueue(
-            nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
+    nb->stream[InteractionLocality::Local] =
+            clCreateCommandQueue(nb->dev_rundata->deviceContext.context(),
+                                 nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
     if (CL_SUCCESS != cl_error)
     {
         gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank,
@@ -674,8 +679,9 @@ NbnxmGpu* gpu_init(const DeviceInformation*   deviceInfo,
     {
         init_plist(nb->plist[InteractionLocality::NonLocal]);
 
-        nb->stream[InteractionLocality::NonLocal] = clCreateCommandQueue(
-                nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
+        nb->stream[InteractionLocality::NonLocal] =
+                clCreateCommandQueue(nb->dev_rundata->deviceContext.context(),
+                                     nb->deviceInfo->oclDeviceId, queue_properties, &cl_error);
         if (CL_SUCCESS != cl_error)
         {
             gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -788,21 +794,23 @@ void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const Inte
     }
 
     // TODO most of this function is the same in CUDA and OpenCL, move into the header
-    DeviceContext context = nb->dev_rundata->context;
+    const DeviceContext& deviceContext = nb->dev_rundata->deviceContext;
 
-    reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, context);
+    reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
+                           deviceContext);
     copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), stream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
-    reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, context);
+    reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
+                           deviceContext);
     copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), stream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
     reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
-                           &d_plist->nimask, &d_plist->imask_nalloc, context);
+                           &d_plist->nimask, &d_plist->imask_nalloc, deviceContext);
 
     reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
-                           &d_plist->excl_nalloc, context);
+                           &d_plist->excl_nalloc, deviceContext);
     copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), stream,
                        GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
 
@@ -865,19 +873,21 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
             freeDeviceBuffer(&d_atdat->atom_types);
         }
 
-        d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
+        d_atdat->f = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
+                                    CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY,
                                     nalloc * DIM * sizeof(nbat->out[0].f[0]), nullptr, &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
-        d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
+        d_atdat->xq = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
+                                     CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                                      nalloc * sizeof(cl_float4), nullptr, &cl_error);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                            ("clCreateBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
 
         if (useLjCombRule(nb->nbparam->vdwtype))
         {
-            d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context,
+            d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
                                               CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                                               nalloc * sizeof(cl_float2), nullptr, &cl_error);
             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
@@ -885,7 +895,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
         }
         else
         {
-            d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context,
+            d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->deviceContext.context(),
                                                  CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY,
                                                  nalloc * sizeof(int), nullptr, &cl_error);
             GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
@@ -969,19 +979,9 @@ static void free_gpu_device_runtime_data(gmx_device_runtime_data_t* runData)
         return;
     }
 
-    cl_int gmx_unused cl_error;
-
-    if (runData->context)
-    {
-        cl_error = clReleaseContext(runData->context);
-        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
-                           ("clReleaseContext failed: " + ocl_get_error_string(cl_error)).c_str());
-        runData->context = nullptr;
-    }
-
     if (runData->program)
     {
-        cl_error = clReleaseProgram(runData->program);
+        cl_int cl_error = clReleaseProgram(runData->program);
         GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
                            ("clReleaseProgram failed: " + ocl_get_error_string(cl_error)).c_str());
         runData->program = nullptr;
@@ -1076,7 +1076,7 @@ void gpu_free(NbnxmGpu* nb)
     }
 
     free_gpu_device_runtime_data(nb->dev_rundata);
-    sfree(nb->dev_rundata);
+    delete nb->dev_rundata;
 
     /* Free timers and timings */
     delete nb->timers;
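
The raw cl_context member of gmx_device_runtime_data_t is replaced by an embedded DeviceContext, populated via setContext() and read back via context(). Note that clReleaseContext() disappears from free_gpu_device_runtime_data() and the struct switches from snew/sfree to new/delete, which suggests that the handle is now released by the DeviceContext destructor once the owning object is deleted. A sketch of the OpenCL-side interface implied by the calls in this file (see gpu_utils/device_context_ocl.h for the committed version, which may differ):

    // OpenCL builds: thin owning wrapper around cl_context.
    class DeviceContext
    {
    public:
        //! Adopt an externally created context, as nbnxn_gpu_create_context() does above.
        void setContext(cl_context context) { context_ = context; }
        //! Raw handle used wherever a cl_context is needed (buffers, queues, program compilation).
        cl_context context() const { return context_; }
        //! Presumably releases context_ (clReleaseContext) on destruction.
        ~DeviceContext();

    private:
        cl_context context_ = nullptr;
    };
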
index 8a4e217d84d56139653477897e5013ad2befe800..0ba3345780533b3f7d5881fc81c814e995324b6e 100644 (file)
@@ -200,9 +200,10 @@ void nbnxn_gpu_compile_kernels(NbnxmGpu* nb)
         {
             /* TODO when we have a proper MPI-aware logging module,
                the log output here should be written there */
-            program = gmx::ocl::compileProgram(
-                    stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl", extraDefines,
-                    nb->dev_rundata->context, nb->deviceInfo->oclDeviceId, nb->deviceInfo->deviceVendor);
+            program =
+                    gmx::ocl::compileProgram(stderr, "gromacs/nbnxm/opencl", "nbnxm_ocl_kernels.cl",
+                                             extraDefines, nb->dev_rundata->deviceContext.context(),
+                                             nb->deviceInfo->oclDeviceId, nb->deviceInfo->deviceVendor);
         }
         catch (gmx::GromacsException& e)
         {