Fix multiple tMPI ranks per OpenCL device

author Szilárd Páll <pall.szilard@gmail.com>

Fri, 18 Mar 2016 01:12:18 +0000 (02:12 +0100)

committer Szilárd Páll <pall.szilard@gmail.com>

Thu, 31 Mar 2016 00:05:24 +0000 (02:05 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Fri, 18 Mar 2016 01:12:18 +0000 (02:12 +0100)
committer Szilárd Páll <pall.szilard@gmail.com>
Thu, 31 Mar 2016 00:05:24 +0000 (02:05 +0200)
diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst

index f0cd19654261add1297ee34b3b1e1701bd5a3ae5..9bbd4d2c62ae591076e109253106bad54214f146 100644 (file)
--- a/docs/user-guide/mdrun-performance.rst
+++ b/docs/user-guide/mdrun-performance.rst
@@ -554,7 +554,6 @@ Known limitations of the OpenCL support
  Limitations in the current OpenCL support of interest to |Gromacs| users:
  
  - Using more than one GPU on a node is supported only with thread MPI
-- Sharing a GPU between multiple PP ranks is not supported
  - No Intel devices (CPUs, GPUs or Xeon Phi) are supported
  - Due to blocking behavior of some asynchronous task enqueuing functions
    in the NVIDIA OpenCL runtime, with the affected driver versions there is
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh

index 1fa80b4c44e6d1e475d7d501e80804ad25f053e3..495b1a4d291ce32e853d01d91a4cf49ad3c1d155 100644 (file)
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -107,7 +107,12 @@
  
  #endif /* CHECK_CUDA_ERRORS */
  
-/*! CUDA device information. */
+/*! \brief CUDA device information.
+ *
+ * The CUDA device information is queried and set at detection and contains
+ * both information about the device/hardware returned by the runtime as well
+ * as additional data like support status.
+ */
  struct gmx_device_info_t
  {
      int                 id;                     /* id of the CUDA device */
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp

index 2084d8cca5639065c44bdff375a3d5507d680a7e..3cd6cf0941e7a50819b59cb1dcddc66bc59622d9 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -339,29 +339,12 @@ int detect_gpus(gmx_gpu_info_t *gpu_info, char *err_str)
  //! This function is documented in the header file
  void free_gpu_info(const gmx_gpu_info_t gmx_unused *gpu_info)
  {
-    if (gpu_info)
+    if (gpu_info == NULL)
      {
-        for (int i = 0; i < gpu_info->n_dev; i++)
-        {
-            cl_int gmx_unused cl_error;
-
-            if (gpu_info->gpu_dev[i].context)
-            {
-                cl_error                     = clReleaseContext(gpu_info->gpu_dev[i].context);
-                gpu_info->gpu_dev[i].context = NULL;
-                assert(CL_SUCCESS == cl_error);
-            }
-
-            if (gpu_info->gpu_dev[i].program)
-            {
-                cl_error                     = clReleaseProgram(gpu_info->gpu_dev[i].program);
-                gpu_info->gpu_dev[i].program = NULL;
-                assert(CL_SUCCESS == cl_error);
-            }
-        }
-
-        sfree(gpu_info->gpu_dev);
+        return;
      }
+
+    sfree(gpu_info->gpu_dev);
  }
  
  //! This function is documented in the header file
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h

index 365b91328a2a1d4a9257404221bafa3bd715e8ed..e4913b6181617ad692673eddbca912c47717bcc2 100644 (file)
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2016, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -59,7 +59,9 @@ typedef enum {
      OCL_VENDOR_UNKNOWN
  } ocl_vendor_id_t;
  
-/*! \internal \brief OpenCL GPU device identificator
+/*! \internal
+ * \brief OpenCL GPU device identificator
+ *
   * An OpenCL device is identified by its ID.
   * The platform ID is also included for caching reasons.
   */
@@ -69,31 +71,39 @@ typedef struct
      cl_device_id        ocl_device_id;   /**< Device ID */
  } ocl_gpu_id_t;
  
-/*! \internal \brief OpenCL GPU information
- *
- * \todo Move context and program outside this data structure.
- * They are specific to a certain usage of the device (e.g. with/without OpenGL
- * interop) and do not provide general device information as the data structure
- * name indicates.
+/*! \internal
+ * \brief OpenCL device information.
   *
- * TODO Document fields
+ * The OpenCL device information is queried and set at detection and contains
+ * both information about the device/hardware returned by the runtime as well
+ * as additional data like support status.
   */
  struct gmx_device_info_t
  {
-    //! @cond Doxygen_Suppress
-    ocl_gpu_id_t        ocl_gpu_id;
-    char                device_name[256];
-    char                device_version[256];
-    char                device_vendor[256];
-    int                 compute_units;
-    int                 adress_bits;
-    int                 stat;
-    ocl_vendor_id_t     vendor_e;
-
-    cl_context          context;
-    cl_program          program;
-    //! @endcond Doxygen_Suppress
+    ocl_gpu_id_t        ocl_gpu_id;          /**< device ID assigned at detection   */
+    char                device_name[256];    /**< device name */
+    char                device_version[256]; /**< device version */
+    char                device_vendor[256];  /**< device vendor */
+    int                 compute_units;       /**< number of compute units */
+    int                 adress_bits;         /**< number of adress bits the device is capable of */
+    int                 stat;                /**< device status takes values of e_gpu_detect_res_t */
+    ocl_vendor_id_t     vendor_e;            /**< device vendor as defined by ocl_vendor_id_t */
+};
  
+/*! \internal
+ * \brief OpenCL GPU runtime data
+ *
+ * The device runtime data is meant to hold objects associated with a GROMACS rank's
+ * (thread or process) use of a single device (multiple devices per rank is not
+ * implemented). These objects should be constructed at ther point where a device
+ * dets assigned to a rank and released at when this assignment is no longer valid
+ * (i.e. at cleanup in the current implementation).
+ *
+ */
+struct gmx_device_runtime_data_t
+{
+    cl_context context; /**< OpenCL context */
+    cl_program program; /**< OpenCL program */
  };
  
  #if !defined(NDEBUG)
diff --git a/src/gromacs/hardware/detecthardware.cpp b/src/gromacs/hardware/detecthardware.cpp

index a17a373c07ce41fd3873634d10798100eaba8b95..91715e6fd42762dc5c85e8902940d6257b4a6e0d 100644 (file)
--- a/src/gromacs/hardware/detecthardware.cpp
+++ b/src/gromacs/hardware/detecthardware.cpp
@@ -86,10 +86,10 @@ static const bool bGPUBinary = GMX_GPU != GMX_GPU_NONE;
   * enumeration" in src/config.h.cmakein, so that GMX_GPU looks up an
   * array entry. */
  
-/* CUDA supports everything. Our current OpenCL implementation only
- * supports using exactly one GPU per PP rank, so sharing is
- * impossible */
-static const bool gpuSharingSupport[] = { false, true, false };
+/* Both CUDA and OpenCL (on the supported/tested platforms) supports
+ * GPU device sharing.
+ */
+static const bool gpuSharingSupport[] = { false, true, true };
  static const bool bGpuSharingSupported = gpuSharingSupport[GMX_GPU];
  
  /* CUDA supports everything. Our current OpenCL implementation seems
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index 56ae3bf98d8a082aa3573f5429053c5d3ce65665..be8afc1ee1ead6ea78e864621ed8c01177be9389 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -3233,8 +3233,12 @@ void free_gpu_resources(const t_forcerec     *fr,
          nbnxn_gpu_free(fr->nbv->gpu_nbv);
  
          /* With tMPI we need to wait for all ranks to finish deallocation before
-         * destroying the context in free_gpu() as some ranks may be sharing
+         * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
           * GPU and context.
+         *
+         * This is not a concern in OpenCL where we use one context per rank which
+         * is freed in nbnxn_gpu_free().
+         *
           * Note: as only PP ranks need to free GPU resources, so it is safe to
           * not call the barrier on PME ranks.
           */
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp

index 0b8b74d0ddfad2c956b80f10a589d902f5928cd1..b3218f3901fb21ad3f724848a7fb6b2745fff61f 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp
@@ -238,7 +238,7 @@ static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
  
      if (NULL == kernel_ptr[0])
      {
-        *kernel_ptr = clCreateKernel(nb->dev_info->program, kernel_name_to_run, &cl_error);
+        *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
          assert(cl_error == CL_SUCCESS);
      }
      // TODO: handle errors
@@ -537,7 +537,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t               *nb,
  
              if (NULL == nb->debug_buffer)
              {
-                nb->debug_buffer = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                nb->debug_buffer = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                    debug_buffer_size, debug_buffer_h, &cl_error);
  
                  assert(CL_SUCCESS == cl_error);
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp

index 4a87625f5e4d36a138d1816799961d0702db7d22..fa76c745b2315f6231a7f88d15ae2340467eb132 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp
@@ -141,8 +141,6 @@ void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
                            bool bAsync = true,
                            cl_event *copy_event = NULL)
  {
-    cl_int cl_error;
-
      if (d_dest == NULL || req_size < 0)
      {
          return;
@@ -152,6 +150,8 @@ void ocl_realloc_buffered(cl_mem *d_dest, void *h_src,
         than the current requested size */
      if (req_size > *curr_alloc_size)
      {
+        cl_int gmx_unused cl_error;
+
          /* only free if the array has already been initialized */
          if (*curr_alloc_size >= 0)
          {
@@ -203,9 +203,9 @@ static void free_ocl_buffer(cl_mem *buffer)
   * If called with an already allocated table, it just re-uploads the
   * table.
   */
-static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
-                                           cl_nbparam_t              *nbp,
-                                           const gmx_device_info_t   *dev_info)
+static void init_ewald_coulomb_force_table(const interaction_const_t       *ic,
+                                           cl_nbparam_t                    *nbp,
+                                           const gmx_device_runtime_data_t *runData)
  {
      cl_mem       coul_tab;
  
@@ -224,11 +224,11 @@ static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
         array_format.image_channel_data_type = CL_FLOAT;
         array_format.image_channel_order     = CL_R;
  
-       coul_tab = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+       coul_tab = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
         &array_format, tabsize, 1, 0, ftmp, &cl_error);
       */
  
-    coul_tab = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
+    coul_tab = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ic->tabq_size*sizeof(cl_float), ic->tabq_coul_F, &cl_error);
      assert(cl_error == CL_SUCCESS);
      // TODO: handle errors, check clCreateBuffer flags
  
@@ -241,7 +241,7 @@ static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
  /*! \brief Initializes the atomdata structure first time, it only gets filled at
      pair-search.
   */
-static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_info_t *dev_info)
+static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_runtime_data_t *runData)
  {
      cl_int cl_error;
  
@@ -252,7 +252,7 @@ static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_info_t
      ad->shift_vec_elem_size = sizeof(*(((nbnxn_atomdata_t*)0)->shift_vec));
  
      // TODO: handle errors, check clCreateBuffer flags
-    ad->shift_vec = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, SHIFTS * ad->shift_vec_elem_size, NULL, &cl_error);
+    ad->shift_vec = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, SHIFTS * ad->shift_vec_elem_size, NULL, &cl_error);
      assert(cl_error == CL_SUCCESS);
      ad->bShiftVecUploaded = false;
  
@@ -260,15 +260,15 @@ static void init_atomdata_first(cl_atomdata_t *ad, int ntypes, gmx_device_info_t
         of the host side fshift buffer. */
      ad->fshift_elem_size = sizeof(*(((cl_nb_staging_t*)0)->fshift));
  
-    ad->fshift = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, SHIFTS * ad->fshift_elem_size, NULL, &cl_error);
+    ad->fshift = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, SHIFTS * ad->fshift_elem_size, NULL, &cl_error);
      assert(cl_error == CL_SUCCESS);
      // TODO: handle errors, check clCreateBuffer flags
  
-    ad->e_lj = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
+    ad->e_lj = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
      assert(cl_error == CL_SUCCESS);
      // TODO: handle errors, check clCreateBuffer flags
  
-    ad->e_el = clCreateBuffer(dev_info->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
+    ad->e_el = clCreateBuffer(runData->context, CL_MEM_READ_WRITE, sizeof(float), NULL, &cl_error);
      assert(cl_error == CL_SUCCESS);
      // TODO: handle errors, check clCreateBuffer flags
  
@@ -387,10 +387,10 @@ map_interaction_types_to_gpu_kernel_flavors(const interaction_const_t *ic,
  
  /*! \brief Initializes the nonbonded parameter data structure.
   */
-static void init_nbparam(cl_nbparam_t              *nbp,
-                         const interaction_const_t *ic,
-                         const nbnxn_atomdata_t    *nbat,
-                         const gmx_device_info_t   *dev_info)
+static void init_nbparam(cl_nbparam_t                    *nbp,
+                         const interaction_const_t       *ic,
+                         const nbnxn_atomdata_t          *nbat,
+                         const gmx_device_runtime_data_t *runData)
  {
      int         ntypes, nnbfp, nnbfp_comb;
      cl_int      cl_error;
@@ -420,7 +420,7 @@ static void init_nbparam(cl_nbparam_t              *nbp,
      nbp->coulomb_tab_climg2d = NULL;
      if (nbp->eeltype == eelOclEWALD_TAB || nbp->eeltype == eelOclEWALD_TAB_TWIN)
      {
-        init_ewald_coulomb_force_table(ic, nbp, dev_info);
+        init_ewald_coulomb_force_table(ic, nbp, runData);
      }
      else
      // TODO: improvement needed.
@@ -435,11 +435,11 @@ static void init_nbparam(cl_nbparam_t              *nbp,
             array_format.image_channel_data_type = CL_FLOAT;
             array_format.image_channel_order     = CL_R;
  
-           nbp->coulomb_tab_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE,
+           nbp->coulomb_tab_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
              &array_format, 1, 1, 0, NULL, &cl_error);
           */
  
-        nbp->coulomb_tab_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
+        nbp->coulomb_tab_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
          // TODO: handle errors
      }
  
@@ -455,11 +455,11 @@ static void init_nbparam(cl_nbparam_t              *nbp,
             array_format.image_channel_data_type = CL_FLOAT;
             array_format.image_channel_order     = CL_R;
  
-           nbp->nbfp_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+           nbp->nbfp_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
              &array_format, nnbfp, 1, 0, nbat->nbfp, &cl_error);
           */
  
-        nbp->nbfp_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp*sizeof(cl_float), nbat->nbfp, &cl_error);
+        nbp->nbfp_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp*sizeof(cl_float), nbat->nbfp, &cl_error);
          assert(cl_error == CL_SUCCESS);
          // TODO: handle errors
  
@@ -467,9 +467,9 @@ static void init_nbparam(cl_nbparam_t              *nbp,
          {
              /* Switched from using textures to using buffers */
              // TODO: decide which alternative is most efficient - textures or buffers.
-            /*  nbp->nbfp_comb_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+            /*  nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                  &array_format, nnbfp_comb, 1, 0, nbat->nbfp_comb, &cl_error);*/
-            nbp->nbfp_comb_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp_comb*sizeof(cl_float), nbat->nbfp_comb, &cl_error);
+            nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nnbfp_comb*sizeof(cl_float), nbat->nbfp_comb, &cl_error);
  
  
              assert(cl_error == CL_SUCCESS);
@@ -482,9 +482,9 @@ static void init_nbparam(cl_nbparam_t              *nbp,
              // don't accept NULL values for image2D parameters.
              /* Switched from using textures to using buffers */
              // TODO: decide which alternative is most efficient - textures or buffers.
-            /* nbp->nbfp_comb_climg2d = clCreateImage2D(dev_info->context, CL_MEM_READ_WRITE,
+            /* nbp->nbfp_comb_climg2d = clCreateImage2D(runData->context, CL_MEM_READ_WRITE,
                  &array_format, 1, 1, 0, NULL, &cl_error);*/
-            nbp->nbfp_comb_climg2d = clCreateBuffer(dev_info->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
+            nbp->nbfp_comb_climg2d = clCreateBuffer(runData->context, CL_MEM_READ_ONLY, sizeof(cl_float), NULL, &cl_error);
  
  
              assert(cl_error == CL_SUCCESS);
@@ -508,7 +508,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t    *nbv,
  
      nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw);
  
-    init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info);
+    init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_rundata);
  }
  
  /*! \brief Initializes the pair list data structure.
@@ -564,12 +564,13 @@ static void init_timings(gmx_wallclock_gpu_t *t)
   *
   * A fatal error results if creation fails.
   *
- * \param[inout] nb        Manages OpenCL non-bonded calculations;
- *                         contexts returned in dev_info members
- * \param[in]    rank      MPI rank (for error reporting)
+ * \param[inout] runtimeData runtime data including program and context
+ * \param[in]    devInfo     device info struct
+ * \param[in]    rank        MPI rank (for error reporting)
   */
  static void
-nbnxn_gpu_create_context(gmx_nbnxn_ocl_t           *nb,
+nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData,
+                         const gmx_device_info_t   *devInfo,
                           int                        rank)
  {
      cl_context_properties     context_properties[3];
@@ -578,8 +579,11 @@ nbnxn_gpu_create_context(gmx_nbnxn_ocl_t           *nb,
      cl_context                context;
      cl_int                    cl_error;
  
-    platform_id      = nb->dev_info->ocl_gpu_id.ocl_platform_id;
-    device_id        = nb->dev_info->ocl_gpu_id.ocl_device_id;
+    assert(runtimeData != NULL);
+    assert(devInfo != NULL);
+
+    platform_id      = devInfo->ocl_gpu_id.ocl_platform_id;
+    device_id        = devInfo->ocl_gpu_id.ocl_device_id;
  
      context_properties[0] = CL_CONTEXT_PLATFORM;
      context_properties[1] = (cl_context_properties) platform_id;
@@ -588,14 +592,14 @@ nbnxn_gpu_create_context(gmx_nbnxn_ocl_t           *nb,
      context = clCreateContext(context_properties, 1, &device_id, NULL, NULL, &cl_error);
      if (CL_SUCCESS != cl_error)
      {
-        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
+        gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s:\n OpenCL error %d: %s",
                    rank,
-                  nb->dev_info->device_name,
-                  cl_error);
+                  devInfo->device_name,
+                  cl_error, ocl_get_error_string(cl_error));
          return;
      }
  
-    nb->dev_info->context = context;
+    runtimeData->context = context;
  }
  
  /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
@@ -605,7 +609,7 @@ static cl_kernel nbnxn_gpu_create_kernel(gmx_nbnxn_ocl_t *nb,
      cl_kernel kernel;
      cl_int    cl_error;
  
-    kernel = clCreateKernel(nb->dev_info->program, kernel_name, &cl_error);
+    kernel = clCreateKernel(nb->dev_rundata->program, kernel_name, &cl_error);
      if (CL_SUCCESS != cl_error)
      {
          gmx_fatal(FARGS, "Failed to create kernel '%s' for GPU #%s: OpenCL error %d",
@@ -676,10 +680,11 @@ static void nbnxn_ocl_init_const(gmx_nbnxn_ocl_t                *nb,
                                   const interaction_const_t      *ic,
                                   const nonbonded_verlet_group_t *nbv_group)
  {
-    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype, nb->dev_info);
-    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_info);
+    init_atomdata_first(nb->atdat, nbv_group[0].nbat->ntype, nb->dev_rundata);
+    init_nbparam(nb->nbparam, ic, nbv_group[0].nbat, nb->dev_rundata);
  }
  
+
  //! This function is documented in the header file
  void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
                      const gmx_gpu_info_t      *gpu_info,
@@ -719,6 +724,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
  
      /* set device info, just point it to the right GPU among the detected ones */
      nb->dev_info = gpu_info->gpu_dev + gpu_opt->dev_use[my_gpu_index];
+    snew(nb->dev_rundata, 1);
  
      /* init to NULL the debug buffer */
      nb->debug_buffer = NULL;
@@ -743,10 +749,10 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
          queue_properties = 0;
      }
  
-    nbnxn_gpu_create_context(nb, rank);
+    nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank);
  
      /* local/non-local GPU streams */
-    nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_info->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+    nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
      if (CL_SUCCESS != cl_error)
      {
          gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -760,7 +766,7 @@ void nbnxn_gpu_init(gmx_nbnxn_ocl_t          **p_nb,
      {
          init_plist(nb->plist[eintNonlocal]);
  
-        nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_info->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
+        nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error);
          if (CL_SUCCESS != cl_error)
          {
              gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d",
@@ -881,19 +887,19 @@ void nbnxn_gpu_init_pairlist(gmx_nbnxn_ocl_t        *nb,
      ocl_realloc_buffered(&d_plist->sci, h_plist->sci, sizeof(nbnxn_sci_t),
                           &d_plist->nsci, &d_plist->sci_nalloc,
                           h_plist->nsci,
-                         nb->dev_info->context,
+                         nb->dev_rundata->context,
                           stream, true, &(nb->timers->pl_h2d_sci[iloc]));
  
      ocl_realloc_buffered(&d_plist->cj4, h_plist->cj4, sizeof(nbnxn_cj4_t),
                           &d_plist->ncj4, &d_plist->cj4_nalloc,
                           h_plist->ncj4,
-                         nb->dev_info->context,
+                         nb->dev_rundata->context,
                           stream, true, &(nb->timers->pl_h2d_cj4[iloc]));
  
      ocl_realloc_buffered(&d_plist->excl, h_plist->excl, sizeof(nbnxn_excl_t),
                           &d_plist->nexcl, &d_plist->excl_nalloc,
                           h_plist->nexcl,
-                         nb->dev_info->context,
+                         nb->dev_rundata->context,
                           stream, true, &(nb->timers->pl_h2d_excl[iloc]));
  
      /* need to prune the pair list during the next step */
@@ -949,25 +955,25 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_ocl_t               *nb,
          d_atdat->f_elem_size = sizeof(rvec);
  
          // TODO: handle errors, check clCreateBuffer flags
-        d_atdat->f = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * d_atdat->f_elem_size, NULL, &cl_error);
+        d_atdat->f = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * d_atdat->f_elem_size, NULL, &cl_error);
          assert(CL_SUCCESS == cl_error);
  
          // TODO: change the flag to read-only
-        d_atdat->xq = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float4), NULL, &cl_error);
+        d_atdat->xq = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float4), NULL, &cl_error);
          assert(CL_SUCCESS == cl_error);
          // TODO: handle errors, check clCreateBuffer flags
  
          if (useLjCombRule(nb->nbparam->vdwtype))
          {
              // TODO: change the flag to read-only
-            d_atdat->lj_comb = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float2), NULL, &cl_error);
+            d_atdat->lj_comb = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(cl_float2), NULL, &cl_error);
              assert(CL_SUCCESS == cl_error);
              // TODO: handle errors, check clCreateBuffer flags
          }
          else
          {
              // TODO: change the flag to read-only
-            d_atdat->atom_types = clCreateBuffer(nb->dev_info->context, CL_MEM_READ_WRITE, nalloc * sizeof(int), NULL, &cl_error);
+            d_atdat->atom_types = clCreateBuffer(nb->dev_rundata->context, CL_MEM_READ_WRITE, nalloc * sizeof(int), NULL, &cl_error);
              assert(CL_SUCCESS == cl_error);
              // TODO: handle errors, check clCreateBuffer flags
          }
@@ -1029,6 +1035,38 @@ void free_kernels(cl_kernel *kernels, int count)
      }
  }
  
+/*! \brief Free the OpenCL runtime data (context and program).
+ *
+ *  The function releases the OpenCL context and program assuciated with the
+ *  device that the calling PP rank is running on.
+ *
+ *  \param runData [in]  porinter to the structure with runtime data.
+ */
+static void free_gpu_device_runtime_data(gmx_device_runtime_data_t *runData)
+{
+    if (runData == NULL)
+    {
+        return;
+    }
+
+    cl_int gmx_unused cl_error;
+
+    if (runData->context)
+    {
+        cl_error         = clReleaseContext(runData->context);
+        runData->context = NULL;
+        assert(CL_SUCCESS == cl_error);
+    }
+
+    if (runData->program)
+    {
+        cl_error         = clReleaseProgram(runData->program);
+        runData->program = NULL;
+        assert(CL_SUCCESS == cl_error);
+    }
+
+}
+
  //! This function is documented in the header file
  void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
  {
@@ -1115,6 +1153,9 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
          nb->misc_ops_and_local_H2D_done = NULL;
      }
  
+    free_gpu_device_runtime_data(nb->dev_rundata);
+    sfree(nb->dev_rundata);
+
      /* Free timers and timings */
      sfree(nb->timers);
      sfree(nb->timings);
@@ -1126,6 +1167,7 @@ void nbnxn_gpu_free(gmx_nbnxn_ocl_t *nb)
      }
  }
  
+
  //! This function is documented in the header file
  gmx_wallclock_gpu_t * nbnxn_gpu_get_timings(gmx_nbnxn_ocl_t *nb)
  {
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp

index 3b9785d85bb8a935f9ef6818e739dd9044f1f8eb..7809f4be889f22d85448734c3806287a8af6c0ec 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp
@@ -192,7 +192,7 @@ nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb)
      }
  
      device_id        = nb->dev_info->ocl_gpu_id.ocl_device_id;
-    context          = nb->dev_info->context;
+    context          = nb->dev_rundata->context;
  
      /* Here we pass macros and static const int variables defined in include
       * files outside the nbnxn_ocl as macros, to avoid including those files
@@ -237,5 +237,5 @@ nbnxn_gpu_compile_kernels(gmx_nbnxn_ocl_t *nb)
      }
      GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
  
-    nb->dev_info->program = program;
+    nb->dev_rundata->program = program;
  }
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h

index e155bec4ea5abb5e7b2aaf32e9c39763147d9bb3..0fa212584e7641ac0200981124aa1782c19b061c 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h
@@ -271,7 +271,8 @@ typedef struct cl_timers
   */
  struct gmx_nbnxn_ocl_t
  {
-    struct gmx_device_info_t *dev_info;        /**< OpenCL device information                                  */
+    struct gmx_device_info_t         *dev_info;    /**< OpenCL device information                              */
+    struct gmx_device_runtime_data_t *dev_rundata; /**< OpenCL runtime data (context, kernels)                 */
  
      /**< Pointers to non-bonded kernel functions
       * organized similar with nb_kfunc_xxx arrays in nbnxn_ocl.cpp */
author	Szilárd Páll <pall.szilard@gmail.com>
	Fri, 18 Mar 2016 01:12:18 +0000 (02:12 +0100)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Thu, 31 Mar 2016 00:05:24 +0000 (02:05 +0200)
docs/user-guide/mdrun-performance.rst		patch \| blob \| history
src/gromacs/gpu_utils/cudautils.cuh		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils_ocl.cpp		patch \| blob \| history
src/gromacs/gpu_utils/oclutils.h		patch \| blob \| history
src/gromacs/hardware/detecthardware.cpp		patch \| blob \| history
src/gromacs/mdlib/forcerec.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_data_mgmt.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_jit_support.cpp		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_types.h		patch \| blob \| history