Small improvement in OpenCL error handling

author Szilárd Páll <pall.szilard@gmail.com>

Wed, 15 Jan 2020 14:04:51 +0000 (15:04 +0100)

committer Paul Bauer <paul.bauer.q@gmail.com>

Wed, 29 Jan 2020 17:00:28 +0000 (18:00 +0100)
author Szilárd Páll <pall.szilard@gmail.com>
Wed, 15 Jan 2020 14:04:51 +0000 (15:04 +0100)
committer Paul Bauer <paul.bauer.q@gmail.com>
Wed, 29 Jan 2020 17:00:28 +0000 (18:00 +0100)
diff --git a/src/gromacs/gpu_utils/oclutils.cpp b/src/gromacs/gpu_utils/oclutils.cpp

index 5d9630b45009efa2182f373d5097a125d0d5b411..f987ae00e78fb35e8aa285a702eca55a40585657 100644 (file)
--- a/src/gromacs/gpu_utils/oclutils.cpp
+++ b/src/gromacs/gpu_utils/oclutils.cpp
@@ -73,19 +73,17 @@ int ocl_copy_H2D(cl_mem             d_dest,
          case GpuApiCallBehavior::Async:
              cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src,
                                              0, nullptr, copy_event);
-            assert(cl_error == CL_SUCCESS);
-            // TODO: handle errors
              break;
  
          case GpuApiCallBehavior::Sync:
              cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0,
                                              nullptr, copy_event);
-            assert(cl_error == CL_SUCCESS);
-            // TODO: handle errors
              break;
  
          default: throw;
      }
+    GMX_ASSERT(cl_error == CL_SUCCESS,
+               ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
  
      return 0;
  }
@@ -133,19 +131,18 @@ int ocl_copy_D2H(void*              h_dest,
          case GpuApiCallBehavior::Async:
              cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0,
                                             nullptr, copy_event);
-            assert(cl_error == CL_SUCCESS);
-            // TODO: handle errors
              break;
  
          case GpuApiCallBehavior::Sync:
              cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0,
                                             nullptr, copy_event);
-            assert(cl_error == CL_SUCCESS);
-            // TODO: handle errors
              break;
  
          default: throw;
      }
+    GMX_ASSERT(cl_error == CL_SUCCESS,
+               ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str());
+
  
      return 0;
  }
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index d0e693f85facf028c575bd7d82e52c723777ccdf..12d38ed2677adf97a65a55a3977e13d087d5db34 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -108,7 +108,7 @@ static inline void validate_global_work_size(const KernelLaunchConfig& config,
      cl_uint device_size_t_size_bits;
      cl_uint host_size_t_size_bits;
  
-    assert(dinfo);
+    GMX_ASSERT(dinfo, "Need a valid device info object");
  
      size_t global_work_size[3];
      GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet");
@@ -346,8 +346,10 @@ static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwt
      cl_kernel*  kernel_ptr;
      cl_int      cl_error;
  
-    assert(eeltype < eelOclNR);
-    assert(evdwtype < evdwOclNR);
+    GMX_ASSERT(eeltype < eelOclNR,
+               "The electrostatics type requested is not implemented in the OpenCL kernels.");
+    GMX_ASSERT(evdwtype < evdwOclNR,
+               "The VdW type requested is not implemented in the OpenCL kernels.");
  
      if (bDoEne)
      {
@@ -379,9 +381,9 @@ static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwt
      if (nullptr == kernel_ptr[0])
      {
          *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
-        assert(cl_error == CL_SUCCESS);
+        GMX_ASSERT(cl_error == CL_SUCCESS,
+                   ("clCreateKernel failed: " + ocl_get_error_string(cl_error)).c_str());
      }
-    // TODO: handle errors
  
      return *kernel_ptr;
  }
@@ -466,7 +468,8 @@ static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
  
      /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
      cl_error = clReleaseEvent(*ocl_event);
-    assert(CL_SUCCESS == cl_error);
+    GMX_ASSERT(cl_error == CL_SUCCESS,
+               ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
      *ocl_event = nullptr;
  }
  
@@ -540,14 +543,16 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
          {
              cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
                      stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
-            assert(CL_SUCCESS == cl_error);
+            GMX_ASSERT(cl_error == CL_SUCCESS,
+                       ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
  
              /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
               * in the local stream in order to be able to sync with the above event
               * from the non-local stream.
               */
              cl_error = clFlush(stream);
-            assert(CL_SUCCESS == cl_error);
+            GMX_ASSERT(cl_error == CL_SUCCESS,
+                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
          }
          else
          {
@@ -897,7 +902,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
  
      /* kick off work */
      cl_error = clFlush(stream);
-    assert(CL_SUCCESS == cl_error);
+    GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
  
      /* After the non-local D2H is launched the nonlocal_done event can be
         recorded which signals that the local D2H can proceed. This event is not
@@ -906,7 +911,8 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
      if (iloc == InteractionLocality::NonLocal)
      {
          cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
-        assert(CL_SUCCESS == cl_error);
+        GMX_ASSERT(cl_error == CL_SUCCESS,
+                   ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
          nb->bNonLocalStreamActive = CL_TRUE;
      }
  
@@ -960,8 +966,6 @@ int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic)
      /* OpenCL: By default, use analytical Ewald
       * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
       *
-     * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
-     *
       */
      /* By default use analytical Ewald. */
      bUseAnalyticalEwald = true;
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp

index fe53ddf9c4a6d5f86fc832fabb7eee8f0c3b9aaf..e7b2884f1a0032dad014e57761e8946bb31d106b 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -491,8 +491,8 @@ static void nbnxn_gpu_create_context(gmx_device_runtime_data_t* runtimeData,
      cl_context            context;
      cl_int                cl_error;
  
-    assert(runtimeData != nullptr);
-    assert(devInfo != nullptr);
+    GMX_ASSERT(runtimeData, "Need a valid runtimeData object");
+    GMX_ASSERT(devInfo, "Need a valid device info object");
  
      platform_id = devInfo->ocl_gpu_id.ocl_platform_id;
      device_id   = devInfo->ocl_gpu_id.ocl_device_id;
@@ -619,7 +619,7 @@ NbnxmGpu* gpu_init(const gmx_device_info_t*   deviceInfo,
      cl_int                      cl_error;
      cl_command_queue_properties queue_properties;
  
-    assert(ic);
+    GMX_ASSERT(ic, "Need a valid interaction constants object");
  
      auto nb = new NbnxmGpu;
      snew(nb->atdat, 1);
@@ -752,7 +752,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
      /* kick off buffer clearing kernel to ensure concurrency with constraints/update */
      cl_int gmx_unused cl_error;
      cl_error = clFlush(nb->stream[InteractionLocality::Local]);
-    assert(CL_SUCCESS == cl_error);
+    GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
  }
  
  //! This function is documented in the header file
@@ -933,7 +933,7 @@ static void free_kernel(cl_kernel* kernel_ptr)
  {
      cl_int gmx_unused cl_error;
  
-    assert(nullptr != kernel_ptr);
+    GMX_ASSERT(kernel_ptr, "Need a valid kernel pointer");
  
      if (*kernel_ptr)
      {
author	Szilárd Páll <pall.szilard@gmail.com>
	Wed, 15 Jan 2020 14:04:51 +0000 (15:04 +0100)
committer	Paul Bauer <paul.bauer.q@gmail.com>
	Wed, 29 Jan 2020 17:00:28 +0000 (18:00 +0100)
src/gromacs/gpu_utils/oclutils.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp		patch \| blob \| history