From 99c4197e24caf1e7d58d79553c59387936057cde Mon Sep 17 00:00:00 2001
From: Artem Zhmurov
Date: Thu, 20 Feb 2020 17:13:23 +0100
Subject: [PATCH] Remove stream from GPU kernel launch config

The stream is not a configuration parameter, hence it should not be
part of the kernel config struct. This also removes the dependency of
the kernel config on DeviceStream.

Change-Id: Idb0426f21bc9db1490053b82f1d11bee55ae9133
---
 src/gromacs/domdec/gpuhaloexchange_impl.cu      |  8 ++++----
 src/gromacs/ewald/pme_gpu_internal.cpp          | 12 ++++++------
 src/gromacs/gpu_utils/cudautils.cuh             |  4 +++-
 src/gromacs/gpu_utils/gpu_utils.cu              | 11 ++++++++---
 src/gromacs/gpu_utils/gputraits.cuh             | 10 ++++++----
 src/gromacs/gpu_utils/gputraits_ocl.h           | 10 ++++++----
 src/gromacs/gpu_utils/oclutils.h                |  8 +++++---
 src/gromacs/gpu_utils/tests/typecasts_runner.cu |  4 ++--
 src/gromacs/listed_forces/gpubondedkernels.cu   |  4 ++--
 src/gromacs/mdlib/leapfrog_gpu.cu               |  3 +--
 src/gromacs/mdlib/lincs_gpu.cu                  |  4 ++--
 src/gromacs/mdlib/settle_gpu.cu                 |  4 ++--
 src/gromacs/mdlib/update_constrain_gpu_impl.cu  |  5 ++---
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu            | 12 ++++--------
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp          |  8 +++-----
 15 files changed, 56 insertions(+), 51 deletions(-)

diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index 0829800111..6c241f3fd5 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -214,7 +214,6 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box
     config.gridSize[1] = 1;
     config.gridSize[2] = 1;
     config.sharedMemorySize = 0;
-    config.stream = nonLocalStream_.stream();

     const float3* sendBuf = d_sendBuf_;
     const float3* d_x = d_x_;
@@ -240,7 +239,8 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box
         const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &sendBuf, &d_x,
                                                           &indexMap, &size, &coordinateShift);

-        launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
+        launchGpuKernel(kernelFn, config, nonLocalStream_, nullptr,
+                        "Domdec GPU Apply X Halo Exchange", kernelArgs);
     }

     communicateHaloData(d_x_, HaloQuantity::HaloCoordinates, coordinatesReadyOnDeviceEvent);
@@ -285,7 +285,6 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
     config.gridSize[1] = 1;
     config.gridSize[2] = 1;
     config.sharedMemorySize = 0;
-    config.stream = nonLocalStream_.stream();

     const float3* recvBuf = d_recvBuf_;
     const int* indexMap = d_indexMap_;
@@ -306,7 +305,8 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
         const auto kernelArgs =
                 prepareGpuKernelArguments(kernelFn, config, &d_f, &recvBuf, &indexMap, &size);

-        launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
+        launchGpuKernel(kernelFn, config, nonLocalStream_, nullptr,
+                        "Domdec GPU Apply F Halo Exchange", kernelArgs);
     }

     if (pulse_ == 0)
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index 8c61eac86d..54c3f54624 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -1163,7 +1163,6 @@ void pme_gpu_spread(const PmeGpu* pmeGpu,
     config.blockSize[2] = atomsPerBlock;
     config.gridSize[0] = dimGrid.first;
     config.gridSize[1] = dimGrid.second;
-    config.stream = pmeGpu->archSpecific->pmeStream_.stream();

     int timingId;
     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
@@ -1203,7 +1202,8 @@
                                                      &kernelParamsPtr->atoms.d_coordinates);
 #endif

-    launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
+    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent,
+                    "PME spline/spread", kernelArgs);
     pme_gpu_stop_timing(pmeGpu, timingId);

     const auto& settings = pmeGpu->settings;
@@ -1281,7 +1281,6 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd
     config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1)
                          / gridLinesPerBlock;
     config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
-    config.stream = pmeGpu->archSpecific->pmeStream_.stream();

     int timingId = gtPME_SOLVE;
     PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
@@ -1305,7 +1304,8 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd
             kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli,
             &kernelParamsPtr->constants.d_virialAndEnergy, &kernelParamsPtr->grid.d_fourierGrid);
 #endif
-    launchGpuKernel(kernelPtr, config, timingEvent, "PME solve", kernelArgs);
+    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve",
+                    kernelArgs);
     pme_gpu_stop_timing(pmeGpu, timingId);

     if (computeEnergyAndVirial)
@@ -1403,7 +1403,6 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
     config.blockSize[2] = atomsPerBlock;
     config.gridSize[0] = dimGrid.first;
     config.gridSize[1] = dimGrid.second;
-    config.stream = pmeGpu->archSpecific->pmeStream_.stream();

     // TODO test different cache configs

@@ -1424,7 +1423,8 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
             &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
             &kernelParamsPtr->atoms.d_forces);
 #endif
-    launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
+    launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather",
+                    kernelArgs);
     pme_gpu_stop_timing(pmeGpu, timingId);

     if (pmeGpu->settings.useGpuForceReduction)
diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh
index 48212bf3bc..ff07d174c2 100644
--- a/src/gromacs/gpu_utils/cudautils.cuh
+++ b/src/gromacs/gpu_utils/cudautils.cuh
@@ -313,6 +313,7 @@ std::array<void*, sizeof...(Args)> prepareGpuKernelArguments(KernelPtr kernel,
 * \tparam Args Types of all the kernel arguments
 * \param[in] kernel Kernel function handle
 * \param[in] config Kernel configuration for launching
+ * \param[in] deviceStream GPU stream to launch kernel in
 * \param[in] kernelName Human readable kernel description, for error handling only
 * \param[in] kernelArgs Array of the pointers to the kernel arguments, prepared by
 *                       prepareGpuKernelArguments()
 * \throws gmx::InternalError on kernel launch failure
 */
@@ -320,6 +321,7 @@ std::array<void*, sizeof...(Args)> prepareGpuKernelArguments(KernelPtr kernel,
 template<typename... Args>
 void launchGpuKernel(void (*kernel)(Args...),
                      const KernelLaunchConfig& config,
+                     const DeviceStream& deviceStream,
                      CommandEvent* /*timingEvent */,
                      const char* kernelName,
                      const std::array<void*, sizeof...(Args)>& kernelArgs)
@@ -327,7 +329,7 @@ void launchGpuKernel(void (*kernel)(Args...),
     dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]);
     dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]);
     cudaLaunchKernel((void*)kernel, gridSize, blockSize, const_cast<void**>(kernelArgs.data()),
-                     config.sharedMemorySize, config.stream);
+                     config.sharedMemorySize, deviceStream.stream());

     cudaError_t status = cudaGetLastError();
     if (cudaSuccess != status)
diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu
index 83232bb93c..1fcbdb2423 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cu
+++ b/src/gromacs/gpu_utils/gpu_utils.cu
@@ -50,6 +50,8 @@
 #include

 #include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
 #include "gromacs/gpu_utils/pmalloc_cuda.h"
 #include "gromacs/hardware/gpu_hw_info.h"
 #include "gromacs/utility/basedefinitions.h"
@@ -214,9 +216,12 @@ static int do_sanity_checks(int dev_id, const cudaDeviceProp& dev_prop)
     try
     {
         KernelLaunchConfig config;
-        config.blockSize[0] = 512;
-        const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
-        launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
+        config.blockSize[0] = 512;
+        const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
+        DeviceInformation deviceInfo;
+        const DeviceContext deviceContext(deviceInfo);
+        const DeviceStream deviceStream(deviceInfo, deviceContext, DeviceStreamPriority::Normal, false);
+        launchGpuKernel(k_dummy_test, config, deviceStream, nullptr, "Dummy kernel", dummyArguments);
     }
     catch (gmx::GromacsException& ex)
     {
diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh
index ec3424a8f4..b3d1575bec 100644
--- a/src/gromacs/gpu_utils/gputraits.cuh
+++ b/src/gromacs/gpu_utils/gputraits.cuh
@@ -73,10 +73,12 @@ using CommandEvent = void;
 */
 struct KernelLaunchConfig
 {
-    size_t gridSize[3] = { 1, 1, 1 };  //!< Block counts
-    size_t blockSize[3] = { 1, 1, 1 }; //!< Per-block thread counts
-    size_t sharedMemorySize = 0;       //!< Shared memory size in bytes
-    cudaStream_t stream = nullptr;     //!< Stream to launch kernel in
+    //! Block counts
+    size_t gridSize[3] = { 1, 1, 1 };
+    //! Per-block thread counts
+    size_t blockSize[3] = { 1, 1, 1 };
+    //! Shared memory size in bytes
+    size_t sharedMemorySize = 0;
 };

 //! Sets whether device code can use arrays that are embedded in structs.
diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h
index 0438c084d1..c8219e5c78 100644
--- a/src/gromacs/gpu_utils/gputraits_ocl.h
+++ b/src/gromacs/gpu_utils/gputraits_ocl.h
@@ -89,10 +89,12 @@ using CommandEvent = cl_event;
 */
 struct KernelLaunchConfig
 {
-    size_t gridSize[3] = { 1, 1, 1 };  //!< Work groups (CUDA blocks) counts
-    size_t blockSize[3] = { 1, 1, 1 }; //!< Per work group (CUDA block) thread counts
-    size_t sharedMemorySize = 0;       //!< Shared memory size in bytes
-    cl_command_queue stream = nullptr; //!< Stream to launch kernel in
+    //! Work groups (CUDA blocks) counts
+    size_t gridSize[3] = { 1, 1, 1 };
+    //! Per work group (CUDA block) thread counts
+    size_t blockSize[3] = { 1, 1, 1 };
+    //! Shared memory size in bytes
+    size_t sharedMemorySize = 0;
 };

 /*! \brief Sets whether device code can use arrays that are embedded in structs.
diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h
index 5e36d91ffd..ada961aa04 100644
--- a/src/gromacs/gpu_utils/oclutils.h
+++ b/src/gromacs/gpu_utils/oclutils.h
@@ -233,12 +233,14 @@ void* prepareGpuKernelArguments(cl_kernel kernel, const KernelLaunchConfig& conf
 *
 * \param[in] kernel Kernel function handle
 * \param[in] config Kernel configuration for launching
+ * \param[in] deviceStream GPU stream to launch kernel in
 * \param[in] timingEvent Timing event, fetched from GpuRegionTimer
 * \param[in] kernelName Human readable kernel description, for error handling only
 * \throws gmx::InternalError on kernel launch failure
 */
 inline void launchGpuKernel(cl_kernel kernel,
                             const KernelLaunchConfig& config,
+                            const DeviceStream& deviceStream,
                             CommandEvent* timingEvent,
                             const char* kernelName,
                             const void* /*kernelArgs*/)
@@ -252,9 +254,9 @@ inline void launchGpuKernel(cl_kernel kernel,
     {
         globalWorkSize[i] = config.gridSize[i] * config.blockSize[i];
     }
-    cl_int clError = clEnqueueNDRangeKernel(config.stream, kernel, workDimensions, globalWorkOffset,
-                                            globalWorkSize, config.blockSize, waitListSize,
-                                            waitList, timingEvent);
+    cl_int clError = clEnqueueNDRangeKernel(deviceStream.stream(), kernel, workDimensions,
+                                            globalWorkOffset, globalWorkSize, config.blockSize,
+                                            waitListSize, waitList, timingEvent);
     if (CL_SUCCESS != clError)
     {
         const std::string errorMessage = "GPU kernel (" + std::string(kernelName)
diff --git a/src/gromacs/gpu_utils/tests/typecasts_runner.cu b/src/gromacs/gpu_utils/tests/typecasts_runner.cu
index 682035bb5f..d38212a28b 100644
--- a/src/gromacs/gpu_utils/tests/typecasts_runner.cu
+++ b/src/gromacs/gpu_utils/tests/typecasts_runner.cu
@@ -132,12 +132,12 @@ void convertRVecToFloat3OnDevice(std::vector& h_rVecOutput, const std
     kernelLaunchConfig.blockSize[1] = 1;
     kernelLaunchConfig.blockSize[2] = 1;
     kernelLaunchConfig.sharedMemorySize = 0;
-    kernelLaunchConfig.stream = deviceStream.stream();

     auto kernelPtr = convertRVecToFloat3OnDevice_kernel;
     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, kernelLaunchConfig,
                                                       &d_float3Output, &d_rVecInput, &numElements);
-    launchGpuKernel(kernelPtr, kernelLaunchConfig, nullptr, "convertRVecToFloat3OnDevice_kernel", kernelArgs);
+    launchGpuKernel(kernelPtr, kernelLaunchConfig, deviceStream, nullptr,
+                    "convertRVecToFloat3OnDevice_kernel", kernelArgs);

     copyFromDeviceBuffer(h_float3Output.data(), &d_float3Output, 0, numElements, deviceStream,
                          GpuApiCallBehavior::Sync, nullptr);
diff --git a/src/gromacs/listed_forces/gpubondedkernels.cu b/src/gromacs/listed_forces/gpubondedkernels.cu
index e03a3f1fa1..9e2e23bb08 100644
--- a/src/gromacs/listed_forces/gpubondedkernels.cu
+++ b/src/gromacs/listed_forces/gpubondedkernels.cu
@@ -850,7 +850,6 @@ void GpuBonded::Impl::launchKernel(const t_forcerec* fr, const matrix box)
     config.gridSize[0] = (fTypeRangeEnd + TPB_BONDED) / TPB_BONDED;
     config.gridSize[1] = 1;
     config.gridSize[2] = 1;
-    config.stream = deviceStream_.stream();

     auto kernelPtr = exec_kernel_gpu;
     kernelParams_.scaleFactor = fr->ic->epsfac * fr->fudgeQQ;

     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);

-    launchGpuKernel(kernelPtr, config, nullptr, "exec_kernel_gpu", kernelArgs);
+    launchGpuKernel(kernelPtr, config, deviceStream_, nullptr, "exec_kernel_gpu",
+                    kernelArgs);
 }

 void GpuBonded::launchKernel(const t_forcerec* fr, const gmx::StepWorkload& stepWork, const matrix box)
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu
index a7e19c922d..7e8287235f 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.cu
+++ b/src/gromacs/mdlib/leapfrog_gpu.cu
@@ -311,7 +311,7 @@ void LeapFrogGpu::integrate(const float3* d_x,
         const auto kernelArgs = prepareGpuKernelArguments(
                 kernelPtr, kernelLaunchConfig_, &numAtoms_, &d_x, &d_xp, &d_v, &d_f, &d_inverseMasses_,
                 &dt, &d_lambdas_, &d_tempScaleGroups_, &prVelocityScalingMatrixDiagonal_);
-        launchGpuKernel(kernelPtr, kernelLaunchConfig_, nullptr, "leapfrog_kernel", kernelArgs);
+        launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr, "leapfrog_kernel", kernelArgs);

         return;
     }
@@ -328,7 +328,6 @@ LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream&
     kernelLaunchConfig_.blockSize[1] = 1;
     kernelLaunchConfig_.blockSize[2] = 1;
     kernelLaunchConfig_.sharedMemorySize = 0;
-    kernelLaunchConfig_.stream = deviceStream_.stream();
 }

 LeapFrogGpu::~LeapFrogGpu()
diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cu
index 61bc717852..e358de8cf8 100644
--- a/src/gromacs/mdlib/lincs_gpu.cu
+++ b/src/gromacs/mdlib/lincs_gpu.cu
@@ -475,14 +475,14 @@ void LincsGpu::apply(const float3* d_x,
     {
         config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float);
     }
-    config.stream = deviceStream_.stream();

     kernelParams_.pbcAiuc = pbcAiuc;

     const auto kernelArgs =
             prepareGpuKernelArguments(kernelPtr, config, &kernelParams_, &d_x, &d_xp, &d_v, &invdt);

-    launchGpuKernel(kernelPtr, config, nullptr, "lincs_kernel", kernelArgs);
+    launchGpuKernel(kernelPtr, config, deviceStream_, nullptr,
+                    "lincs_kernel", kernelArgs);

     if (computeVirial)
     {
diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu.cu
index 76daf34c1a..67d5a1b182 100644
--- a/src/gromacs/mdlib/settle_gpu.cu
+++ b/src/gromacs/mdlib/settle_gpu.cu
@@ -455,13 +455,13 @@ void SettleGpu::apply(const float3* d_x,
     {
         config.sharedMemorySize = 0;
     }
-    config.stream = deviceStream_.stream();

     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &numSettles_, &d_atomIds_,
                                                       &settleParameters_, &d_x, &d_xp, &invdt, &d_v,
                                                       &d_virialScaled_, &pbcAiuc);

-    launchGpuKernel(kernelPtr, config, nullptr, "settle_kernel", kernelArgs);
+    launchGpuKernel(kernelPtr, config, deviceStream_, nullptr,
+                    "settle_kernel", kernelArgs);

     if (computeVirial)
     {
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index eed9e44d63..562c1be500 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -158,8 +158,8 @@ void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
     const auto kernelArgs = prepareGpuKernelArguments(
             scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);

-    launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, nullptr,
-                    "scaleCoordinates_kernel", kernelArgs);
+    launchGpuKernel(scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, deviceStream_,
+                    nullptr, "scaleCoordinates_kernel", kernelArgs);

     // TODO: Although this only happens on the pressure coupling steps, this synchronization
     // can affect the perfornamce if nstpcouple is small.
     deviceStream_.synchronize();
@@ -185,7 +185,6 @@ UpdateConstrainGpu::Impl::Impl(const t_inputrec& ir,
     coordinateScalingKernelLaunchConfig_.blockSize[1] = 1;
     coordinateScalingKernelLaunchConfig_.blockSize[2] = 1;
     coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
-    coordinateScalingKernelLaunchConfig_.stream = deviceStream_.stream();
 }

 UpdateConstrainGpu::Impl::~Impl() {}
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 54ce9f331b..f674c9259a 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -545,7 +545,6 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
     config.blockSize[2] = num_threads_z;
     config.gridSize[0] = nblock;
     config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->deviceInfo, nbp);
-    config.stream = deviceStream.stream();

     if (debug)
     {
@@ -564,7 +563,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const In
             (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune), nb->deviceInfo);
     const auto kernelArgs =
             prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &stepWork.computeVirial);
-    launchGpuKernel(kernel, config, timingEvent, "k_calc_nb", kernelArgs);
+    launchGpuKernel(kernel, config, deviceStream, timingEvent, "k_calc_nb", kernelArgs);

     if (bDoTime)
     {
@@ -669,7 +668,6 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     config.blockSize[2] = num_threads_z;
     config.gridSize[0] = nblock;
     config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
-    config.stream = deviceStream.stream();

     if (debug)
     {
@@ -687,7 +685,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     const auto kernel = plist->haveFreshList ? nbnxn_kernel_prune_cuda : nbnxn_kernel_prune_cuda;
     const auto kernelArgs =
             prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part);
-    launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+    launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);

     /* TODO: consider a more elegant way to track which kernel has been called
        (combined or separate 1st pass prune, rolling prune). */
@@ -860,7 +858,6 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid,

     GMX_ASSERT(config.gridSize[0] > 0, "Can not have empty grid, early return above avoids this");
     config.sharedMemorySize = 0;
-    config.stream = deviceStream.stream();

     auto kernelFn = setFillerCoords ? nbnxn_gpu_x_to_nbat_x_kernel : nbnxn_gpu_x_to_nbat_x_kernel;

     const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &numColumns, &d_xq,
                                                       &d_xFloat3, &d_atomIndices, &d_cxy_na,
                                                       &d_cxy_ind, &cellOffset, &numAtomsPerCell);

-    launchGpuKernel(kernelFn, config, nullptr, "XbufferOps", kernelArgs);
+    launchGpuKernel(kernelFn, config, deviceStream, nullptr, "XbufferOps", kernelArgs);
 }

 // TODO: note that this is not necessary when there are no local atoms, that is:
@@ -926,7 +923,6 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo
     config.gridSize[1] = 1;
     config.gridSize[2] = 1;
     config.sharedMemorySize = 0;
-    config.stream = deviceStream.stream();

     auto kernelFn = accumulateForce ? nbnxn_gpu_add_nbat_f_to_f_kernel
                                     : nbnxn_gpu_add_nbat_f_to_f_kernel;
@@ -946,7 +942,7 @@ void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLo
     const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_fNB, &d_fPme, &d_fTotal,
                                                       &d_cell, &atomStart, &numAtoms);

-    launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
+    launchGpuKernel(kernelFn, config, deviceStream, nullptr, "FbufferOps", kernelArgs);

     if (atomLocality == AtomLocality::Local)
     {
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index ba0c2ee939..e4d571e943 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -635,7 +635,6 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb

     KernelLaunchConfig config;
     config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
-    config.stream = deviceStream.stream();
     config.blockSize[0] = c_clSize;
     config.blockSize[1] = c_clSize;
     config.gridSize[0] = plist->nsci;
@@ -672,7 +671,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
                 &adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp_climg2d,
                 &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, &plist->sci, &plist->cj4,
                 &plist->excl, &computeFshift);

-        launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
     }
     else
     {
@@ -681,7 +680,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
                 &adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp_climg2d,
                 &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, &plist->sci, &plist->cj4,
                 &plist->excl, &computeFshift);

-        launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
     }

     if (bDoTime)
@@ -795,7 +794,6 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     /* kernel launch config */
     KernelLaunchConfig config;
     config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
-    config.stream = deviceStream.stream();
     config.blockSize[0] = c_clSize;
     config.blockSize[1] = c_clSize;
     config.blockSize[2] = num_threads_z;
@@ -824,7 +822,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     const auto kernelArgs = prepareGpuKernelArguments(pruneKernel, config, &nbparams_params,
                                                       &adat->xq, &adat->shift_vec, &plist->sci,
                                                       &plist->cj4, &plist->imask, &numParts, &part);

-    launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs);
+    launchGpuKernel(pruneKernel, config, deviceStream, timingEvent, kernelName, kernelArgs);

     if (plist->haveFreshList)
     {
-- 
2.22.0
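
Editor's note (illustrative sketch, not part of the patch): the snippet below summarizes the
call-site change this commit makes, using identifiers that appear in the hunks above
(KernelLaunchConfig, prepareGpuKernelArguments, launchGpuKernel, a class-owned deviceStream_ and
kernelParams_). The grid size, kernel pointer, and the "example_kernel" name are placeholders for
illustration only.

    // Before this patch: the stream travelled inside the launch configuration.
    KernelLaunchConfig config;
    config.blockSize[0]     = 64;
    config.gridSize[0]      = numBlocks; // placeholder
    config.sharedMemorySize = 0;
    // config.stream        = deviceStream_.stream();   // field removed by this patch

    // After this patch: the stream is passed explicitly at launch time, so the
    // config struct remains a pure description of the launch geometry.
    const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, &kernelParams_);
    launchGpuKernel(kernelPtr, config, deviceStream_, nullptr, "example_kernel", kernelArgs);

Passing the DeviceStream as a separate argument also means the same KernelLaunchConfig can be
reused for launches on different streams, and the header no longer needs to know about
DeviceStream at all.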