From d15990b9f8138a506022e77aaa4a3ea409087025 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Wed, 15 Jan 2020 15:04:51 +0100 Subject: [PATCH] Small improvement in OpenCL error handling - replace plain asserts with GMX_ASSERT - improve error message prints by decorating with OpenCL error strings Refs #3300 Change-Id: Ibac66ee0de35c572c0051d8c394a488051b23a7e --- src/gromacs/gpu_utils/oclutils.cpp | 13 ++++----- src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 28 +++++++++++-------- .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 10 +++---- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/src/gromacs/gpu_utils/oclutils.cpp b/src/gromacs/gpu_utils/oclutils.cpp index 5d9630b450..f987ae00e7 100644 --- a/src/gromacs/gpu_utils/oclutils.cpp +++ b/src/gromacs/gpu_utils/oclutils.cpp @@ -73,19 +73,17 @@ int ocl_copy_H2D(cl_mem d_dest, case GpuApiCallBehavior::Async: cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_FALSE, offset, bytes, h_src, 0, nullptr, copy_event); - assert(cl_error == CL_SUCCESS); - // TODO: handle errors break; case GpuApiCallBehavior::Sync: cl_error = clEnqueueWriteBuffer(command_queue, d_dest, CL_TRUE, offset, bytes, h_src, 0, nullptr, copy_event); - assert(cl_error == CL_SUCCESS); - // TODO: handle errors break; default: throw; } + GMX_ASSERT(cl_error == CL_SUCCESS, + ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); return 0; } @@ -133,19 +131,18 @@ int ocl_copy_D2H(void* h_dest, case GpuApiCallBehavior::Async: cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_FALSE, offset, bytes, h_dest, 0, nullptr, copy_event); - assert(cl_error == CL_SUCCESS); - // TODO: handle errors break; case GpuApiCallBehavior::Sync: cl_error = clEnqueueReadBuffer(command_queue, d_src, CL_TRUE, offset, bytes, h_dest, 0, nullptr, copy_event); - assert(cl_error == CL_SUCCESS); - // TODO: handle errors break; default: throw; } + GMX_ASSERT(cl_error == CL_SUCCESS, + ("clEnqueueWriteBuffer failed: " + ocl_get_error_string(cl_error)).c_str()); + return 0; } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index d0e693f85f..12d38ed267 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -108,7 +108,7 @@ static inline void validate_global_work_size(const KernelLaunchConfig& config, cl_uint device_size_t_size_bits; cl_uint host_size_t_size_bits; - assert(dinfo); + GMX_ASSERT(dinfo, "Need a valid device info object"); size_t global_work_size[3]; GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet"); @@ -346,8 +346,10 @@ static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwt cl_kernel* kernel_ptr; cl_int cl_error; - assert(eeltype < eelOclNR); - assert(evdwtype < evdwOclNR); + GMX_ASSERT(eeltype < eelOclNR, + "The electrostatics type requested is not implemented in the OpenCL kernels."); + GMX_ASSERT(evdwtype < evdwOclNR, + "The VdW type requested is not implemented in the OpenCL kernels."); if (bDoEne) { @@ -379,9 +381,9 @@ static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwt if (nullptr == kernel_ptr[0]) { *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error); - assert(cl_error == CL_SUCCESS); + GMX_ASSERT(cl_error == CL_SUCCESS, + ("clCreateKernel failed: " + ocl_get_error_string(cl_error)).c_str()); } - // TODO: handle errors return *kernel_ptr; } @@ -466,7 +468,8 @@ static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event) /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */ cl_error = clReleaseEvent(*ocl_event); - assert(CL_SUCCESS == cl_error); + GMX_ASSERT(cl_error == CL_SUCCESS, + ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str()); *ocl_event = nullptr; } @@ -540,14 +543,16 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom { cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList( stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done)); - assert(CL_SUCCESS == cl_error); + GMX_ASSERT(cl_error == CL_SUCCESS, + ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str()); /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed * in the local stream in order to be able to sync with the above event * from the non-local stream. */ cl_error = clFlush(stream); - assert(CL_SUCCESS == cl_error); + GMX_ASSERT(cl_error == CL_SUCCESS, + ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); } else { @@ -897,7 +902,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* kick off work */ cl_error = clFlush(stream); - assert(CL_SUCCESS == cl_error); + GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); /* After the non-local D2H is launched the nonlocal_done event can be recorded which signals that the local D2H can proceed. This event is not @@ -906,7 +911,8 @@ void gpu_launch_cpyback(NbnxmGpu* nb, if (iloc == InteractionLocality::NonLocal) { cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done)); - assert(CL_SUCCESS == cl_error); + GMX_ASSERT(cl_error == CL_SUCCESS, + ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str()); nb->bNonLocalStreamActive = CL_TRUE; } @@ -960,8 +966,6 @@ int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic) /* OpenCL: By default, use analytical Ewald * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp * - * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices. - * */ /* By default use analytical Ewald. */ bUseAnalyticalEwald = true; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index fe53ddf9c4..e7b2884f1a 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -491,8 +491,8 @@ static void nbnxn_gpu_create_context(gmx_device_runtime_data_t* runtimeData, cl_context context; cl_int cl_error; - assert(runtimeData != nullptr); - assert(devInfo != nullptr); + GMX_ASSERT(runtimeData, "Need a valid runtimeData object"); + GMX_ASSERT(devInfo, "Need a valid device info object"); platform_id = devInfo->ocl_gpu_id.ocl_platform_id; device_id = devInfo->ocl_gpu_id.ocl_device_id; @@ -619,7 +619,7 @@ NbnxmGpu* gpu_init(const gmx_device_info_t* deviceInfo, cl_int cl_error; cl_command_queue_properties queue_properties; - assert(ic); + GMX_ASSERT(ic, "Need a valid interaction constants object"); auto nb = new NbnxmGpu; snew(nb->atdat, 1); @@ -752,7 +752,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) /* kick off buffer clearing kernel to ensure concurrency with constraints/update */ cl_int gmx_unused cl_error; cl_error = clFlush(nb->stream[InteractionLocality::Local]); - assert(CL_SUCCESS == cl_error); + GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str()); } //! This function is documented in the header file @@ -933,7 +933,7 @@ static void free_kernel(cl_kernel* kernel_ptr) { cl_int gmx_unused cl_error; - assert(nullptr != kernel_ptr); + GMX_ASSERT(kernel_ptr, "Need a valid kernel pointer"); if (*kernel_ptr) { -- 2.22.0