From: Aleksei Iupinov Date: Mon, 19 Feb 2018 18:25:34 +0000 (+0100) Subject: prepareGpuKernelArguments() and launchGpuKernel() are added X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=58ec6661f1e59ce55c0e525400a0aa2aa8888106;p=alexxy%2Fgromacs.git prepareGpuKernelArguments() and launchGpuKernel() are added Template functions are added for preparing/packing GPU kernel arguments and launching GPU kernels. This allows hiding the CUDA/OpenCL boilerplate. launchGpuKernel() throws InternalError on failure. GPU kernel scheduling details are unified into a KernelLaunchConfig struct. The new functions are applied in the NB kernels as well as in PME (facilitating the PME OpenCL port). Change-Id: I90205cf1cda93c377a342b7f1c46a8caf239ba65 --- diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu index 7dc096fc24..71d1687926 100644 --- a/src/gromacs/ewald/pme-gather.cu +++ b/src/gromacs/ewald/pme-gather.cu @@ -428,7 +428,6 @@ void pme_gpu_gather(PmeGpu *pmeGpu, pme_gpu_copy_input_forces(pmeGpu); } - cudaStream_t stream = pmeGpu->archSpecific->pmeStream; const int order = pmeGpu->common->pme_order; const auto *kernelParamsPtr = pmeGpu->kernelParams.get(); @@ -445,35 +444,38 @@ void pme_gpu_gather(PmeGpu *pmeGpu, const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM); GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size"); - const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; - auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); - dim3 dimBlock(order, order, atomsPerBlock); + const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; + auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); - const bool wrapX = true; - const bool wrapY = true; + KernelLaunchConfig config; + config.blockSize[0] = config.blockSize[1] = order; + config.blockSize[2] = atomsPerBlock; + config.gridSize[0] = dimGrid.x; + config.gridSize[1] = dimGrid.y; + config.stream = pmeGpu->archSpecific->pmeStream; + + if (order != 4) + { + GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!")); + } + + constexpr bool wrapX = true; + constexpr bool wrapY = true; GMX_UNUSED_VALUE(wrapX); GMX_UNUSED_VALUE(wrapY); // TODO test different cache configs - pme_gpu_start_timing(pmeGpu, gtPME_GATHER); - if (order == 4) - { - if (forceTreatment == PmeForceOutputHandling::Set) - { - pme_gather_kernel<4, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - } - else - { - pme_gather_kernel<4, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - } - } - else - { - GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 is not implemented")); - } - CU_LAUNCH_ERR("pme_gather_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_GATHER); + int timingId = gtPME_GATHER; + void (*kernelPtr)(const PmeGpuCudaKernelParams) = (forceTreatment == PmeForceOutputHandling::Set) ?
+ pme_gather_kernel<4, true, wrapX, wrapY> : + pme_gather_kernel<4, false, wrapX, wrapY>; + + pme_gpu_start_timing(pmeGpu, timingId); + auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); + launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs); + pme_gpu_stop_timing(pmeGpu, timingId); pme_gpu_copy_output_forces(pmeGpu); } diff --git a/src/gromacs/ewald/pme-solve.cu b/src/gromacs/ewald/pme-solve.cu index f0f55df576..6a035dd410 100644 --- a/src/gromacs/ewald/pme-solve.cu +++ b/src/gromacs/ewald/pme-solve.cu @@ -445,37 +445,36 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid, const int blocksPerGridLine = (gridLineSize + maxBlockSize - 1) / maxBlockSize; const int cellsPerBlock = gridLineSize * gridLinesPerBlock; const int blockSize = (cellsPerBlock + warp_size - 1) / warp_size * warp_size; + + + KernelLaunchConfig config; + config.blockSize[0] = blockSize; + config.gridSize[0] = blocksPerGridLine; // rounding up to full warps so that shuffle operations produce defined results - dim3 threads(blockSize); - dim3 blocks(blocksPerGridLine, - (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1) / gridLinesPerBlock, - pmeGpu->kernelParams->grid.complexGridSize[majorDim]); + config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1) / gridLinesPerBlock; + config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim]; + config.stream = pmeGpu->archSpecific->pmeStream; - pme_gpu_start_timing(pmeGpu, gtPME_SOLVE); + int timingId = gtPME_SOLVE; + void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr; if (gridOrdering == GridOrdering::YZX) { - if (computeEnergyAndVirial) - { - pme_solve_kernel<GridOrdering::YZX, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr); - } - else - { - pme_solve_kernel<GridOrdering::YZX, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr); - } + kernelPtr = computeEnergyAndVirial ? + pme_solve_kernel<GridOrdering::YZX, true> : + pme_solve_kernel<GridOrdering::YZX, false>; } else if (gridOrdering == GridOrdering::XYZ) { - if (computeEnergyAndVirial) - { - pme_solve_kernel<GridOrdering::XYZ, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr); - } - else - { - pme_solve_kernel<GridOrdering::XYZ, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr); - } + kernelPtr = computeEnergyAndVirial ? + pme_solve_kernel<GridOrdering::XYZ, true> : + pme_solve_kernel<GridOrdering::XYZ, false>; } - CU_LAUNCH_ERR("pme_solve_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_SOLVE); + + pme_gpu_start_timing(pmeGpu, timingId); + auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); + launchGpuKernel(kernelPtr, config, timingEvent, "PME solve", kernelArgs); + pme_gpu_stop_timing(pmeGpu, timingId); if (computeEnergyAndVirial) { diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu index 6574894146..89df2c77f9 100644 --- a/src/gromacs/ewald/pme-spread.cu +++ b/src/gromacs/ewald/pme-spread.cu @@ -489,7 +489,6 @@ void pme_gpu_spread(const PmeGpu *pmeGpu, bool spreadCharges) { GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)"); - cudaStream_t stream = pmeGpu->archSpecific->pmeStream; const auto *kernelParamsPtr = pmeGpu->kernelParams.get(); GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread"); @@ -503,50 +502,53 @@ void pme_gpu_spread(const PmeGpu *pmeGpu, //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define) GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size"); - const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; - auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); - dim3 dimBlock(order, order, atomsPerBlock); + const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; + auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); + + KernelLaunchConfig config; + config.blockSize[0] = config.blockSize[1] = order; + config.blockSize[2] = atomsPerBlock; + config.gridSize[0] = dimGrid.x; + config.gridSize[1] = dimGrid.y; + config.stream = pmeGpu->archSpecific->pmeStream; + + if (order != 4) + { + GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!")); + } // These should later check for PME decomposition - const bool wrapX = true; - const bool wrapY = true; + constexpr bool wrapX = true; + constexpr bool wrapY = true; GMX_UNUSED_VALUE(wrapX); GMX_UNUSED_VALUE(wrapY); - switch (order) + + int timingId; + void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr; + if (computeSplines) { - case 4: + if (spreadCharges) { - // TODO: cleaner unroll with some template trick?
- if (computeSplines) - { - if (spreadCharges) - { - pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD); - pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - CU_LAUNCH_ERR("pme_spline_and_spread_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD); - } - else - { - pme_gpu_start_timing(pmeGpu, gtPME_SPLINE); - pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - CU_LAUNCH_ERR("pme_spline_and_spread_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE); - } - } - else - { - pme_gpu_start_timing(pmeGpu, gtPME_SPREAD); - pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - CU_LAUNCH_ERR("pme_spline_and_spread_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD); - } + timingId = gtPME_SPLINEANDSPREAD; + kernelPtr = pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY>; + } + else + { + timingId = gtPME_SPLINE; + kernelPtr = pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY>; } - break; - - default: - GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not tested!")); } + else + { + timingId = gtPME_SPREAD; + kernelPtr = pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY>; + } + + pme_gpu_start_timing(pmeGpu, timingId); + auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); + launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs); + pme_gpu_stop_timing(pmeGpu, timingId); const bool copyBackGrid = spreadCharges && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu)); if (copyBackGrid) diff --git a/src/gromacs/ewald/pme-timings.cu b/src/gromacs/ewald/pme-timings.cu index f806e803a7..306ad2392b 100644 --- a/src/gromacs/ewald/pme-timings.cu +++ b/src/gromacs/ewald/pme-timings.cu @@ -67,6 +67,17 @@ void pme_gpu_start_timing(const PmeGpu *pmeGpu, size_t PMEStageId) } } +CommandEvent *pme_gpu_fetch_timing_event(const PmeGpu *pmeGpu, size_t PMEStageId) +{ + CommandEvent *timingEvent = nullptr; + if (pme_gpu_timings_enabled(pmeGpu)) + { + GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); + timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent(); + } + return timingEvent; +} + void pme_gpu_stop_timing(const PmeGpu *pmeGpu, size_t PMEStageId) { if (pme_gpu_timings_enabled(pmeGpu)) diff --git a/src/gromacs/ewald/pme-timings.cuh b/src/gromacs/ewald/pme-timings.cuh index 61dcba58d4..4df40b17e5 100644 --- a/src/gromacs/ewald/pme-timings.cuh +++ b/src/gromacs/ewald/pme-timings.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -55,6 +55,15 @@ struct PmeGpu; */ void pme_gpu_start_timing(const PmeGpu *pmeGpu, size_t PMEStageId); +/*! \libinternal \brief + * Returns raw timing event from the corresponding GpuRegionTimer (if timings are enabled). + * In CUDA result can be nullptr stub, per GpuRegionTimer implementation. + * + * \param[in] pmeGpu The PME GPU data structure. 
+ * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h + */ +CommandEvent *pme_gpu_fetch_timing_event(const PmeGpu *pmeGpu, size_t PMEStageId); + /*! \libinternal \brief * Stops timing the certain PME GPU stage during a single computation (if timings are enabled). * diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh index 945d7bb1d7..6eabc3ce14 100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@ -42,10 +42,13 @@ #include <nvml.h> #endif /* HAVE_NVML */ +#include <array> #include <string> +#include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/math/vec.h" #include "gromacs/math/vectypes.h" +#include "gromacs/utility/exceptions.h" #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/stringutil.h" @@ -123,32 +126,10 @@ enum class GpuApiCallBehavior; } \ } while (0) -/*! Check for any previously occurred uncaught CUDA error -- aimed at use after kernel calls. */ -#define CU_LAUNCH_ERR(msg) \ - do { \ - cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \ - if (_CU_LAUNCH_ERR_status != cudaSuccess) { \ - gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_LAUNCH_ERR_status)); \ - } \ - } while (0) - -/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error -- aimed at use after kernel calls. */ -#define CU_LAUNCH_ERR_SYNC(msg) \ - do { \ - cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \ - if (_CU_SYNC_LAUNCH_ERR_status != cudaSuccess) { \ - gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_SYNC_LAUNCH_ERR_status)); \ - } \ - } while (0) - #else /* CHECK_CUDA_ERRORS */ #define CU_RET_ERR(status, msg) do { } while (0) #define CU_CHECK_PREV_ERR() do { } while (0) -#define CU_LAUNCH_ERR(msg) do { } while (0) -#define CU_LAUNCH_ERR_SYNC(msg) do { } while (0) #define HANDLE_NVML_RET_ERR(status, msg) do { } while (0) #endif /* CHECK_CUDA_ERRORS */ @@ -286,4 +267,97 @@ static inline bool haveStreamTasksCompleted(cudaStream_t s) return true; } +/* Kernel launch helpers */ + +/*! \brief + * A function for setting up a single CUDA kernel argument. + * This is the tail of the compile-time recursive function below. + * It has to be seen by the compiler first. + * + * \tparam totalArgsCount Number of the kernel arguments + * \tparam KernelPtr Kernel function handle type + * \param[in] argIndex Index of the current argument + */ +template <size_t totalArgsCount, typename KernelPtr> +void prepareGpuKernelArgument(KernelPtr /*kernel*/, + std::array<void *, totalArgsCount> */* kernelArgsPtr */, + size_t gmx_used_in_debug argIndex) +{ + GMX_ASSERT(argIndex == totalArgsCount, "Tail expansion"); +} + +/*! \brief + * Compile-time recursive function for setting up a single CUDA kernel argument. + * This function copies a kernel argument pointer \p argPtr into \p kernelArgsPtr, + * and calls itself on the next argument, eventually calling the tail function above.
+ * + * \tparam CurrentArg Type of the current argument + * \tparam RemainingArgs Types of remaining arguments after the current one + * \tparam totalArgsCount Number of the kernel arguments + * \tparam KernelPtr Kernel function handle type + * \param[in] kernel Kernel function handle + * \param[in,out] kernelArgsPtr Pointer to the argument array to be filled in + * \param[in] argIndex Index of the current argument + * \param[in] argPtr Pointer to the current argument + * \param[in] otherArgsPtrs Pack of pointers to arguments remaining to process after the current one + */ +template <typename CurrentArg, typename ... RemainingArgs, size_t totalArgsCount, typename KernelPtr> +void prepareGpuKernelArgument(KernelPtr kernel, + std::array<void *, totalArgsCount> *kernelArgsPtr, + size_t argIndex, + const CurrentArg *argPtr, + const RemainingArgs *... otherArgsPtrs) +{ + (*kernelArgsPtr)[argIndex] = (void *)argPtr; + prepareGpuKernelArgument(kernel, kernelArgsPtr, argIndex + 1, otherArgsPtrs ...); +} + +/*! \brief + * A wrapper function for setting up all the CUDA kernel arguments. + * Calls the recursive functions above. + * + * \tparam Args Types of all the kernel arguments + * \param[in] kernel Kernel function handle + * \param[in] argsPtrs Pointers to all the kernel arguments + * \returns A prepared parameter pack to be used with launchGpuKernel() as the last argument. + */ +template <typename ... Args> +std::array<void *, sizeof ... (Args)> prepareGpuKernelArguments(void (*kernel)(Args...), + const KernelLaunchConfig & /*config */, + const Args *... argsPtrs) +{ + std::array<void *, sizeof ... (Args)> kernelArgs; + prepareGpuKernelArgument(kernel, &kernelArgs, 0, argsPtrs ...); + return kernelArgs; +} + +/*! \brief Launches the CUDA kernel and handles the errors. + * + * \tparam Args Types of all the kernel arguments + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] kernelName Human readable kernel description, for error handling only + * \param[in] kernelArgs Array of the pointers to the kernel arguments, prepared by prepareGpuKernelArguments() + * \throws gmx::InternalError on kernel launch failure + */ +template <typename ... Args> +void launchGpuKernel(void (*kernel)(Args...), + const KernelLaunchConfig &config, + CommandEvent */*timingEvent */, + const char *kernelName, + const std::array<void *, sizeof ... (Args)> &kernelArgs) +{ + dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]); + dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]); + cudaLaunchKernel((void *)kernel, gridSize, blockSize, const_cast<void **>(kernelArgs.data()), config.sharedMemorySize, config.stream); + + cudaError_t status = cudaGetLastError(); + if (cudaSuccess != status) + { + const std::string errorMessage = "GPU kernel (" + std::string(kernelName) + + ") failed to launch: " + std::string(cudaGetErrorString(status)); + GMX_THROW(gmx::InternalError(errorMessage)); + } +} + #endif diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index c5e367884d..4ed5cbefc1 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -254,7 +254,10 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop) } /* try to execute a dummy kernel */ - k_dummy_test<<< 1, 512>>> (); + KernelLaunchConfig config; + config.blockSize[0] = 512; + const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config); + launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments); if (cudaThreadSynchronize() != cudaSuccess) { return -1; diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh index 323f80eb9a..a4406e1943 100644 ---
a/src/gromacs/gpu_utils/gputraits.cuh +++ b/src/gromacs/gpu_utils/gputraits.cuh @@ -49,4 +49,17 @@ using CommandEvent = void; //! \brief Context used explicitly in OpenCL, does nothing in CUDA using Context = void *; +/*! \internal \brief + * GPU kernels scheduling description. This is same in OpenCL/CUDA. + * Provides reasonable defaults, one typically only needs to set the GPU stream + * and non-1 work sizes. + */ +struct KernelLaunchConfig +{ + size_t gridSize[3] = {1, 1, 1}; //!< Block counts + size_t blockSize[3] = {1, 1, 1}; //!< Per-block thread counts + size_t sharedMemorySize = 0; //!< Shared memory size in bytes + CommandStream stream = nullptr; //!< Stream to launch kernel in +}; + #endif diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h index adf6f9508f..0a2301e38f 100644 --- a/src/gromacs/gpu_utils/gputraits_ocl.h +++ b/src/gromacs/gpu_utils/gputraits_ocl.h @@ -51,4 +51,17 @@ using CommandEvent = cl_event; //! \brief Context used explicitly in OpenCL using Context = cl_context; +/*! \internal \brief + * GPU kernels scheduling description. This is same in OpenCL/CUDA. + * Provides reasonable defaults, one typically only needs to set the GPU stream + * and non-1 work sizes. + */ +struct KernelLaunchConfig +{ + size_t gridSize[3] = {1, 1, 1}; //!< Work groups (CUDA blocks) counts + size_t blockSize[3] = {1, 1, 1}; //!< Per work group (CUDA block) thread counts + size_t sharedMemorySize = 0; //!< Shared memory size in bytes + CommandStream stream = nullptr; //!< Stream to launch kernel in +}; + #endif diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h index afe3238875..ef7de9aa54 100644 --- a/src/gromacs/gpu_utils/oclutils.h +++ b/src/gromacs/gpu_utils/oclutils.h @@ -44,6 +44,8 @@ #include <string> #include "gromacs/gpu_utils/gmxopencl.h" +#include "gromacs/gpu_utils/gputraits_ocl.h" +#include "gromacs/utility/exceptions.h" #include "gromacs/utility/gmxassert.h" enum class GpuApiCallBehavior; @@ -177,4 +179,107 @@ static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s) return false; } +/* Kernel launch helpers */ + +/*! \brief + * A function for setting up a single OpenCL kernel argument. + * This is the tail of the compile-time recursive function below. + * It has to be seen by the compiler first. + * As NB kernels might be using dynamic local memory as the last argument, + * this function also manages that, using sharedMemorySize from \p config. + * + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] argIndex Index of the current argument + */ +void inline prepareGpuKernelArgument(cl_kernel kernel, + const KernelLaunchConfig &config, + size_t argIndex) +{ + if (config.sharedMemorySize > 0) + { + cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, config.sharedMemorySize, nullptr); + GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str()); + } +} + +/*! \brief + * Compile-time recursive function for setting up a single OpenCL kernel argument. + * This function uses one kernel argument pointer \p argPtr to call clSetKernelArg(), + * and calls itself on the next argument, eventually calling the tail function above.
+ * + * \tparam CurrentArg Type of the current argument + * \tparam RemainingArgs Types of remaining arguments after the current one + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] argIndex Index of the current argument + * \param[in] argPtr Pointer to the current argument + * \param[in] otherArgsPtrs Pack of pointers to arguments remaining to process after the current one + */ +template <typename CurrentArg, typename ... RemainingArgs> +void prepareGpuKernelArgument(cl_kernel kernel, + const KernelLaunchConfig &config, + size_t argIndex, + const CurrentArg *argPtr, + const RemainingArgs *... otherArgsPtrs) +{ + cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, sizeof(CurrentArg), argPtr); + GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str()); + + prepareGpuKernelArgument(kernel, config, argIndex + 1, otherArgsPtrs ...); +} + +/*! \brief + * A wrapper function for setting up all the OpenCL kernel arguments. + * Calls the recursive functions above. + * + * \tparam Args Types of all the kernel arguments + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] argsPtrs Pointers to all the kernel arguments + * \returns A handle for the prepared parameter pack to be used with launchGpuKernel() as the last argument + * - currently always nullptr for OpenCL, as it manages kernel/arguments association by itself. + */ +template <typename ... Args> +void *prepareGpuKernelArguments(cl_kernel kernel, + const KernelLaunchConfig &config, + const Args *... argsPtrs) +{ + prepareGpuKernelArgument(kernel, config, 0, argsPtrs ...); + return nullptr; +} + +/*! \brief Launches the OpenCL kernel and handles the errors. + * + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] timingEvent Timing event, fetched from GpuRegionTimer + * \param[in] kernelName Human readable kernel description, for error handling only + * \throws gmx::InternalError on kernel launch failure + */ +inline void launchGpuKernel(cl_kernel kernel, + const KernelLaunchConfig &config, + CommandEvent *timingEvent, + const char *kernelName, + const void * /*kernelArgs*/) +{ + const int workDimensions = 3; + const size_t *globalWorkOffset = nullptr; + const size_t waitListSize = 0; + const cl_event *waitList = nullptr; + size_t globalWorkSize[3]; + for (int i = 0; i < workDimensions; i++) + { + globalWorkSize[i] = config.gridSize[i] * config.blockSize[i]; + } + cl_int clError = clEnqueueNDRangeKernel(config.stream, kernel, workDimensions, globalWorkOffset, + globalWorkSize, config.blockSize, waitListSize, waitList, timingEvent); + if (CL_SUCCESS != clError) + { + const std::string errorMessage = "GPU kernel (" + std::string(kernelName) + + ") failed to launch: " + ocl_get_error_string(clError); + GMX_THROW(gmx::InternalError(errorMessage)); + } +} + #endif diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index e18c197235..ecfccb8621 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -299,7 +299,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, cudaError_t stat; int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* CUDA kernel launch-related stuff */ - int shmem, nblock; + int nblock; dim3 dim_block, dim_grid; nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */ @@ -412,29 +412,29 @@ void
nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, num_threads_z = 2; } nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info); - dim_block = dim3(c_clSize, c_clSize, num_threads_z); - dim_grid = dim3(nblock, 1, 1); - shmem = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp); + + KernelLaunchConfig config; + config.blockSize[0] = c_clSize; + config.blockSize[1] = c_clSize; + config.blockSize[2] = num_threads_z; + config.gridSize[0] = nblock; + config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp); + config.stream = stream; if (debug) { - fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %ux%ux%u\n\t" - "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n" - "\tShMem: %d\n", - dim_block.x, dim_block.y, dim_block.z, - dim_grid.x, dim_grid.y, plist->nsci*c_numClPerSupercl, + fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %zux%zux%zu\n\t" + "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n" + "\tShMem: %zu\n", + config.blockSize[0], config.blockSize[1], config.blockSize[2], + config.gridSize[0], config.gridSize[1], plist->nsci*c_numClPerSupercl, c_numClPerSupercl, plist->na_c, - shmem); + config.sharedMemorySize); } - void* kernel_args[4]; - kernel_args[0] = adat; - kernel_args[1] = nbp; - kernel_args[2] = plist; - kernel_args[3] = &bCalcFshift; - - cudaLaunchKernel((void *)nb_kernel, dim_grid, dim_block, kernel_args, shmem, stream); - CU_LAUNCH_ERR("k_calc_nb"); + auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr; + const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, adat, nbp, plist, &bCalcFshift); + launchGpuKernel(nb_kernel, config, timingEvent, "k_calc_nb", kernelArgs); if (bDoTime) { @@ -531,39 +531,32 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, * and j-cluster concurrency, in x, y, and z, respectively. * - The 1D block-grid contains as many blocks as super-clusters. 
*/ - int num_threads_z = c_cudaPruneKernelJ4Concurrency; - int nblock = calc_nb_kernel_nblock(numSciInPart, nb->dev_info); - dim3 dim_block = dim3(c_clSize, c_clSize, num_threads_z); - dim3 dim_grid = dim3(nblock, 1, 1); - int shmem = calc_shmem_required_prune(num_threads_z); + int num_threads_z = c_cudaPruneKernelJ4Concurrency; + int nblock = calc_nb_kernel_nblock(numSciInPart, nb->dev_info); + KernelLaunchConfig config; + config.blockSize[0] = c_clSize; + config.blockSize[1] = c_clSize; + config.blockSize[2] = num_threads_z; + config.gridSize[0] = nblock; + config.sharedMemorySize = calc_shmem_required_prune(num_threads_z); + config.stream = stream; if (debug) { - fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %ux%ux%u\n\t" - "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n" - "\tShMem: %d\n", - dim_block.x, dim_block.y, dim_block.z, - dim_grid.x, dim_grid.y, numSciInPart*c_numClPerSupercl, + fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %zux%zux%zu\n\t" + "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n" + "\tShMem: %zu\n", + config.blockSize[0], config.blockSize[1], config.blockSize[2], + config.gridSize[0], config.gridSize[1], numSciInPart*c_numClPerSupercl, c_numClPerSupercl, plist->na_c, - shmem); + config.sharedMemorySize); } - void* kernel_args[5]; - kernel_args[0] = adat; - kernel_args[1] = nbp; - kernel_args[2] = plist; - kernel_args[3] = &numParts; - kernel_args[4] = &part; - - if (plist->haveFreshList) - { - cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<true>, dim_grid, dim_block, kernel_args, shmem, stream); - } - else - { - cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<false>, dim_grid, dim_block, kernel_args, shmem, stream); - } - CU_LAUNCH_ERR("k_pruneonly"); + auto *timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr; + constexpr char kernelName[] = "k_pruneonly"; + const auto &kernel = plist->haveFreshList ? nbnxn_kernel_prune_cuda<true> : nbnxn_kernel_prune_cuda<false>; + const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part); + launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs); /* TODO: consider a more elegant way to track which kernel has been called (combined or separate 1st pass prune, rolling prune). */ diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index 8f80c77526..af56d62424 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -99,13 +99,20 @@ static const int c_clSize = c_nbnxnGpuClusterSize; /*! \brief Validates the input global work size parameter. */ -static inline void validate_global_work_size(size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo) +static inline void validate_global_work_size(const KernelLaunchConfig &config, int work_dim, const gmx_device_info_t *dinfo) { cl_uint device_size_t_size_bits; cl_uint host_size_t_size_bits; assert(dinfo); + size_t global_work_size[3]; + GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet"); + for (int i = 0; i < work_dim; i++) + { + global_work_size[i] = config.blockSize[i] * config.gridSize[i]; + } + /* Each component of a global_work_size must not exceed the range given by the sizeof(device size_t) for the device on which the kernel execution will be enqueued.
See: @@ -376,12 +383,9 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, int flags, int iloc) { - cl_int cl_error; int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* OpenCL kernel launch-related stuff */ - int shmem; - size_t local_work_size[3], global_work_size[3]; - cl_kernel nb_kernel = NULL; /* fn pointer to the nonbonded kernel */ + cl_kernel nb_kernel = NULL; /* fn pointer to the nonbonded kernel */ cl_atomdata_t *adat = nb->atdat; cl_nbparam_t *nbp = nb->nbparam; @@ -392,7 +396,6 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, bool bCalcEner = flags & GMX_FORCE_ENERGY; int bCalcFshift = flags & GMX_FORCE_VIRIAL; bool bDoTime = nb->bDoTime; - cl_uint arg_no; cl_nbparam_params_t nbparams_params; @@ -446,7 +449,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, { if (iloc == eintLocal) { - cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done)); + cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done)); assert(CL_SUCCESS == cl_error); /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed @@ -493,67 +496,49 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, (plist->haveFreshList && !nb->timers->didPrune[iloc])); /* kernel launch config */ - local_work_size[0] = c_clSize; - local_work_size[1] = c_clSize; - local_work_size[2] = 1; - - global_work_size[0] = plist->nsci * local_work_size[0]; - global_work_size[1] = 1 * local_work_size[1]; - global_work_size[2] = 1 * local_work_size[2]; - validate_global_work_size(global_work_size, 3, nb->dev_info); + KernelLaunchConfig config; + config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam); + config.stream = stream; + config.blockSize[0] = c_clSize; + config.blockSize[1] = c_clSize; + config.gridSize[0] = plist->nsci; - shmem = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam); + validate_global_work_size(config, 3, nb->dev_info); if (debug) { fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t" "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n", - (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]), - (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl, + (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]), + (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl, c_numClPerSupercl, plist->na_c); } fillin_ocl_structures(nbp, &nbparams_params); - arg_no = 0; - cl_error = CL_SUCCESS; - if (!useLjCombRule(nb->nbparam->vdwtype)) - { - cl_error = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes)); - } - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift)); + auto *timingEvent = bDoTime ? 
t->nb_k[iloc].fetchNextEvent() : nullptr; + constexpr char kernelName[] = "k_calc_nb"; if (useLjCombRule(nb->nbparam->vdwtype)) { - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->lj_comb)); + const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, + &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift, + &adat->lj_comb, + &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, + &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift); + + launchGpuKernel(nb_kernel, config, timingEvent, kernelName, kernelArgs); } else { - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types)); + const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, + &adat->ntypes, + &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift, + &adat->atom_types, + &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, + &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift); + launchGpuKernel(nb_kernel, config, timingEvent, kernelName, kernelArgs); } - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL); - - assert(cl_error == CL_SUCCESS); - - if (cl_error) - { - printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str()); - } - cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? 
t->nb_k[iloc].fetchNextEvent() : nullptr); - assert(cl_error == CL_SUCCESS); if (bDoTime) { @@ -591,8 +576,6 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, int iloc, int numParts) { - cl_int cl_error; - cl_atomdata_t *adat = nb->atdat; cl_nbparam_t *nbp = nb->nbparam; cl_plist_t *plist = nb->plist[iloc]; @@ -663,50 +646,35 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, cl_kernel pruneKernel = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList); /* kernel launch config */ - size_t local_work_size[3], global_work_size[3]; - local_work_size[0] = c_clSize; - local_work_size[1] = c_clSize; - local_work_size[2] = num_threads_z; + KernelLaunchConfig config; + config.sharedMemorySize = calc_shmem_required_prune(num_threads_z); + config.stream = stream; + config.blockSize[0] = c_clSize; + config.blockSize[1] = c_clSize; + config.blockSize[2] = num_threads_z; + config.gridSize[0] = numSciInPart; - global_work_size[0] = numSciInPart * local_work_size[0]; - global_work_size[1] = 1 * local_work_size[1]; - global_work_size[2] = 1 * local_work_size[2]; - - validate_global_work_size(global_work_size, 3, nb->dev_info); - - int shmem = calc_shmem_required_prune(num_threads_z); + validate_global_work_size(config, 3, nb->dev_info); if (debug) { fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %dx%dx%d\n\t" "\tGlobal work size: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n" - "\tShMem: %d\n", - (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]), - (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl, - c_numClPerSupercl, plist->na_c, shmem); + "\tShMem: %zu\n", + (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]), + (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl, + c_numClPerSupercl, plist->na_c, config.sharedMemorySize); } cl_nbparam_params_t nbparams_params; fillin_ocl_structures(nbp, &nbparams_params); - cl_uint arg_no = 0; - cl_error = CL_SUCCESS; - - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(nbparams_params), &(nbparams_params)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->xq)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->sci)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->cj4)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->imask)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(numParts)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(part)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr); - assert(cl_error == CL_SUCCESS); - - cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3, - nullptr, global_work_size, local_work_size, - 0, nullptr, bDoTime ? timer->fetchNextEvent() : nullptr); - GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str()); + auto *timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr; + constexpr char kernelName[] = "k_pruneonly"; + const auto kernelArgs = prepareGpuKernelArguments(pruneKernel, config, + &nbparams_params, &adat->xq, &adat->shift_vec, + &plist->sci, &plist->cj4, &plist->imask, &numParts, &part); + launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs); if (plist->haveFreshList) {
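
Usage sketch (not part of the commit): the example below illustrates, for the CUDA side, how a caller combines the KernelLaunchConfig, prepareGpuKernelArguments() and launchGpuKernel() helpers introduced above. The kernel, its arguments and the launch parameters (scaleKernel, d_input, d_output, n, 128 threads per block) are hypothetical placeholders, not names from this patch.

// Hypothetical CUDA-side caller, assuming the helpers from cudautils.cuh above.
__global__ void scaleKernel(const float *input, float *output, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        output[i] = 2.0f * input[i];
    }
}

void launchScaleKernel(const float *d_input, float *d_output, int n, cudaStream_t stream)
{
    KernelLaunchConfig config;              // defaults: 1x1x1 grid/block, no dynamic shared memory
    config.blockSize[0] = 128;              // threads per block
    config.gridSize[0]  = (n + 127) / 128;  // enough blocks to cover n elements
    config.stream       = stream;

    // Packs pointers to the kernel arguments; the types are deduced from the kernel signature.
    const auto kernelArgs = prepareGpuKernelArguments(scaleKernel, config, &d_input, &d_output, &n);

    // nullptr timing event: no GPU timing region is recorded; throws gmx::InternalError on launch failure.
    launchGpuKernel(scaleKernel, config, nullptr, "scale kernel", kernelArgs);
}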
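
On the OpenCL side (oclutils.h above) the call sequence is the same, but the handle is a cl_kernel and the arguments are bound via clSetKernelArg() inside prepareGpuKernelArguments(), which therefore only returns a nullptr placeholder. The sketch below is likewise hypothetical; it assumes the cl_kernel has already been built and the buffers allocated.

// Hypothetical OpenCL-side caller, not part of this commit.
void launchScaleKernelOcl(cl_kernel scaleKernel, cl_mem d_input, cl_mem d_output, cl_int n,
                          cl_command_queue queue, cl_event *timingEvent)
{
    KernelLaunchConfig config;               // same struct and defaults as on the CUDA side
    config.blockSize[0] = 128;               // work-group size
    config.gridSize[0]  = (n + 127) / 128;   // number of work groups
    config.stream       = queue;             // CommandStream is a cl_command_queue in the OpenCL build
    // Setting config.sharedMemorySize > 0 would append a dynamic local-memory argument at the end.

    // Binds all kernel arguments via clSetKernelArg(); the returned handle is just nullptr.
    const auto kernelArgs = prepareGpuKernelArguments(scaleKernel, config, &d_input, &d_output, &n);

    // Enqueues the kernel on the queue; throws gmx::InternalError on launch failure.
    launchGpuKernel(scaleKernel, config, timingEvent, "scale kernel", kernelArgs);
}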