From e3d904d007afc9545f95d72da51c7dc8d83dae54 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Thu, 20 Feb 2020 15:27:19 +0100 Subject: [PATCH] Use DeviceStream init(...) function to create streams Change the stream creation procedures from direct calls to CUDA and OpenCL API to using pre-defined init(...) method of the DeviceStream class. Refs #3314 Refs #3311 Change-Id: I96a0ca41f251b9925ef9bed77c4f355939b65c6d --- src/gromacs/ewald/pme_gpu_internal.cpp | 30 ++------------- src/gromacs/ewald/pme_pp_comm_gpu_impl.cu | 7 ++-- .../state_propagator_data_gpu_impl_gpu.cpp | 8 ++-- .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 20 +++------- src/gromacs/nbnxm/gpu_data_mgmt.h | 1 - src/gromacs/nbnxm/nbnxm_setup.cpp | 2 +- .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 38 ++----------------- 7 files changed, 21 insertions(+), 85 deletions(-) diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 822109de4c..ae308bf570 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -524,32 +524,8 @@ void pme_gpu_init_internal(PmeGpu* pmeGpu) * - default high priority with CUDA * - no priorities implemented yet with OpenCL; see #2532 */ -#if GMX_GPU == GMX_GPU_CUDA - cudaError_t stat; - int highest_priority, lowest_priority; - stat = cudaDeviceGetStreamPriorityRange(&lowest_priority, &highest_priority); - CU_RET_ERR(stat, "PME cudaDeviceGetStreamPriorityRange failed"); - cudaStream_t stream; - stat = cudaStreamCreateWithPriority(&stream, - cudaStreamDefault, // cudaStreamNonBlocking, - highest_priority); - pmeGpu->archSpecific->pmeStream_.setStream(stream); - CU_RET_ERR(stat, "cudaStreamCreateWithPriority on the PME stream failed"); -#elif GMX_GPU == GMX_GPU_OPENCL - cl_command_queue_properties queueProperties = - pmeGpu->archSpecific->useTiming ? CL_QUEUE_PROFILING_ENABLE : 0; - cl_device_id device_id = pmeGpu->deviceInfo->oclDeviceId; - cl_int clError; - pmeGpu->archSpecific->pmeStream_.setStream(clCreateCommandQueue( - pmeGpu->archSpecific->deviceContext_.context(), device_id, queueProperties, &clError)); - - - if (clError != CL_SUCCESS) - { - GMX_THROW(gmx::InternalError( - gmx::formatString("Failed to create PME command queue (OpenCL error %d)", clError).c_str())); - } -#endif + pmeGpu->archSpecific->pmeStream_.init(*pmeGpu->deviceInfo, pmeGpu->archSpecific->deviceContext_, + DeviceStreamPriority::High, pmeGpu->archSpecific->useTiming); } void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu) @@ -812,6 +788,8 @@ static void pme_gpu_select_best_performing_pme_spreadgather_kernels(PmeGpu* pmeG */ static void pme_gpu_init(gmx_pme_t* pme, const DeviceInformation* deviceInfo, const PmeGpuProgram* pmeGpuProgram) { + GMX_ASSERT(deviceInfo != nullptr, + "Device information can not be nullptr when GPU is used for PME."); pme->gpu = new PmeGpu(); PmeGpu* pmeGpu = pme->gpu; changePinningPolicy(&pmeGpu->staging.h_forces, pme_get_pinning_policy()); diff --git a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu index f5aac2981a..32a752746a 100644 --- a/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu +++ b/src/gromacs/ewald/pme_pp_comm_gpu_impl.cu @@ -64,9 +64,10 @@ PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, const DeviceContext& device GMX_RELEASE_ASSERT( GMX_THREAD_MPI, "PME-PP GPU Communication is currently only supported with thread-MPI enabled"); - cudaStream_t stream; - cudaStreamCreate(&stream); - pmePpCommStream_.setStream(stream); + + // In CUDA we only need priority to create stream. + // (note that this will be moved from here in the follow-up patch) + pmePpCommStream_.init(DeviceInformation(), DeviceContext(), DeviceStreamPriority::Normal, false); } PmePpCommGpu::Impl::~Impl() = default; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index b1fefd34a5..d0027852ee 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -109,12 +109,10 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, // TODO: The update stream should be created only when it is needed. # if (GMX_GPU == GMX_GPU_CUDA) - cudaError_t stat; - cudaStream_t stream; - stat = cudaStreamCreate(&stream); - updateStreamOwn_.setStream(stream); + // In CUDA we only need priority to create stream. + // (note that this will be moved from here in the follow-up patch) + updateStreamOwn_.init(DeviceInformation(), DeviceContext(), DeviceStreamPriority::Normal, false); updateStream_ = &updateStreamOwn_; - CU_RET_ERR(stat, "CUDA stream creation failed in StatePropagatorDataGpu"); # endif } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index cbd5f8ce01..36342b935f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -418,8 +418,7 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, const interaction_const_t* ic, const PairlistParams& listParams, const nbnxn_atomdata_t* nbat, - int /*rank*/, - bool bLocalAndNonlocal) + bool bLocalAndNonlocal) { cudaError_t stat; @@ -448,10 +447,8 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, nb->deviceInfo = deviceInfo; /* local/non-local GPU streams */ - cudaStream_t localStream; - stat = cudaStreamCreate(&localStream); - nb->deviceStreams[InteractionLocality::Local].setStream(localStream); - CU_RET_ERR(stat, "cudaStreamCreate on stream[InterationLocality::Local] failed"); + nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceInfo, DeviceContext(), + DeviceStreamPriority::Normal, nb->bDoTime); if (nb->bUseTwoStreams) { init_plist(nb->plist[InteractionLocality::NonLocal]); @@ -460,15 +457,8 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, * priorities, because we are querying the priority range which in this * case will be a single value. */ - int highest_priority; - stat = cudaDeviceGetStreamPriorityRange(nullptr, &highest_priority); - CU_RET_ERR(stat, "cudaDeviceGetStreamPriorityRange failed"); - - cudaStream_t nonLocalStream; - stat = cudaStreamCreateWithPriority(&nonLocalStream, cudaStreamDefault, highest_priority); - nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream); - CU_RET_ERR(stat, - "cudaStreamCreateWithPriority on stream[InteractionLocality::NonLocal] failed"); + nb->deviceStreams[InteractionLocality::NonLocal].init( + *nb->deviceInfo, DeviceContext(), DeviceStreamPriority::High, nb->bDoTime); } /* init events for sychronization (timing disabled for performance reasons!) */ diff --git a/src/gromacs/nbnxm/gpu_data_mgmt.h b/src/gromacs/nbnxm/gpu_data_mgmt.h index e242771862..574588b39a 100644 --- a/src/gromacs/nbnxm/gpu_data_mgmt.h +++ b/src/gromacs/nbnxm/gpu_data_mgmt.h @@ -72,7 +72,6 @@ NbnxmGpu* gpu_init(const DeviceInformation gmx_unused* deviceInfo, const interaction_const_t gmx_unused* ic, const PairlistParams gmx_unused& listParams, const nbnxn_atomdata_t gmx_unused* nbat, - int gmx_unused rank, /* true if both local and non-local are done on GPU */ bool gmx_unused bLocalAndNonlocal) GPU_FUNC_TERM_WITH_RETURN(nullptr); diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp index f7c7f6dd16..d854ede572 100644 --- a/src/gromacs/nbnxm/nbnxm_setup.cpp +++ b/src/gromacs/nbnxm/nbnxm_setup.cpp @@ -452,7 +452,7 @@ std::unique_ptr init_nb_verlet(const gmx::MDLogger& mdlo /* init the NxN GPU data; the last argument tells whether we'll have * both local and non-local NB calculation on GPU */ gpu_nbv = gpu_init(deviceInfo, *deviceContext, fr->ic, pairlistParams, nbat.get(), - cr->nodeid, haveMultipleDomains); + haveMultipleDomains); minimumIlistCountForGpuBalancing = getMinimumIlistCountForGpuBalancing(gpu_nbv); } diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index fa37263a5b..adc60b0a39 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -560,12 +560,8 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, const interaction_const_t* ic, const PairlistParams& listParams, const nbnxn_atomdata_t* nbat, - const int rank, const bool bLocalAndNonlocal) { - cl_int cl_error; - cl_command_queue_properties queue_properties; - GMX_ASSERT(ic, "Need a valid interaction constants object"); auto nb = new NbnxmGpu; @@ -596,42 +592,16 @@ NbnxmGpu* gpu_init(const DeviceInformation* deviceInfo, /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */ nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr); - /* Create queues only after bDoTime has been initialized */ - if (nb->bDoTime) - { - queue_properties = CL_QUEUE_PROFILING_ENABLE; - } - else - { - queue_properties = 0; - } - - cl_command_queue localStream = - clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(), - nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); /* local/non-local GPU streams */ - nb->deviceStreams[InteractionLocality::Local].setStream(localStream); - - if (CL_SUCCESS != cl_error) - { - gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank, - nb->deviceInfo->device_name, cl_error); - } + nb->deviceStreams[InteractionLocality::Local].init(*nb->deviceInfo, nb->dev_rundata->deviceContext_, + DeviceStreamPriority::Normal, nb->bDoTime); if (nb->bUseTwoStreams) { init_plist(nb->plist[InteractionLocality::NonLocal]); - cl_command_queue nonLocalStream = - clCreateCommandQueue(nb->dev_rundata->deviceContext_.context(), - nb->deviceInfo->oclDeviceId, queue_properties, &cl_error); - nb->deviceStreams[InteractionLocality::NonLocal].setStream(nonLocalStream); - - if (CL_SUCCESS != cl_error) - { - gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", - rank, nb->deviceInfo->device_name, cl_error); - } + nb->deviceStreams[InteractionLocality::NonLocal].init( + *nb->deviceInfo, nb->dev_rundata->deviceContext_, DeviceStreamPriority::High, nb->bDoTime); } if (nb->bDoTime) -- 2.22.0