pme_gpu_copy_input_forces(pmeGpu);
}
- cudaStream_t stream = pmeGpu->archSpecific->pmeStream;
const int order = pmeGpu->common->pme_order;
const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
- const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
- auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
- dim3 dimBlock(order, order, atomsPerBlock);
+ const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
- const bool wrapX = true;
- const bool wrapY = true;
+ KernelLaunchConfig config;
+ config.blockSize[0] = config.blockSize[1] = order;
+ config.blockSize[2] = atomsPerBlock;
+ config.gridSize[0] = dimGrid.x;
+ config.gridSize[1] = dimGrid.y;
+ config.stream = pmeGpu->archSpecific->pmeStream;
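+ // Descriptive note: the block is laid out as (order, order, atomsPerBlock), so
+ // order*order threads cooperate on each atom (consistent with
+ // PME_SPREADGATHER_THREADS_PER_ATOM used to derive atomsPerBlock above).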
+
+ if (order != 4)
+ {
+ GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 is not implemented"));
+ }
+
+ constexpr bool wrapX = true;
+ constexpr bool wrapY = true;
GMX_UNUSED_VALUE(wrapX);
GMX_UNUSED_VALUE(wrapY);
// TODO test different cache configs
- pme_gpu_start_timing(pmeGpu, gtPME_GATHER);
- if (order == 4)
- {
- if (forceTreatment == PmeForceOutputHandling::Set)
- {
- pme_gather_kernel<4, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
- }
- else
- {
- pme_gather_kernel<4, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
- }
- }
- else
- {
- GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 is not implemented"));
- }
- CU_LAUNCH_ERR("pme_gather_kernel");
- pme_gpu_stop_timing(pmeGpu, gtPME_GATHER);
+ int timingId = gtPME_GATHER;
+ void (*kernelPtr)(const PmeGpuCudaKernelParams) = (forceTreatment == PmeForceOutputHandling::Set) ?
+ pme_gather_kernel<4, true, wrapX, wrapY> :
+ pme_gather_kernel<4, false, wrapX, wrapY>;
+
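+ // Unified launch sequence: fetch the (possibly null) timing event, marshal the
+ // kernel arguments, then launch through the platform-agnostic wrapper.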
+ pme_gpu_start_timing(pmeGpu, timingId);
+ auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+ launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
+ pme_gpu_stop_timing(pmeGpu, timingId);
pme_gpu_copy_output_forces(pmeGpu);
}
const int blocksPerGridLine = (gridLineSize + maxBlockSize - 1) / maxBlockSize;
const int cellsPerBlock = gridLineSize * gridLinesPerBlock;
const int blockSize = (cellsPerBlock + warp_size - 1) / warp_size * warp_size;
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = blockSize;
+ config.gridSize[0] = blocksPerGridLine;
// rounding up to full warps so that shuffle operations produce defined results
- dim3 threads(blockSize);
- dim3 blocks(blocksPerGridLine,
- (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1) / gridLinesPerBlock,
- pmeGpu->kernelParams->grid.complexGridSize[majorDim]);
+ config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1) / gridLinesPerBlock;
+ config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim];
+ config.stream = pmeGpu->archSpecific->pmeStream;
- pme_gpu_start_timing(pmeGpu, gtPME_SOLVE);
+ int timingId = gtPME_SOLVE;
+ void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr;
if (gridOrdering == GridOrdering::YZX)
{
- if (computeEnergyAndVirial)
- {
- pme_solve_kernel<GridOrdering::YZX, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
- }
- else
- {
- pme_solve_kernel<GridOrdering::YZX, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
- }
+ kernelPtr = computeEnergyAndVirial ?
+ pme_solve_kernel<GridOrdering::YZX, true> :
+ pme_solve_kernel<GridOrdering::YZX, false>;
}
else if (gridOrdering == GridOrdering::XYZ)
{
- if (computeEnergyAndVirial)
- {
- pme_solve_kernel<GridOrdering::XYZ, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
- }
- else
- {
- pme_solve_kernel<GridOrdering::XYZ, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr);
- }
+ kernelPtr = computeEnergyAndVirial ?
+ pme_solve_kernel<GridOrdering::XYZ, true> :
+ pme_solve_kernel<GridOrdering::XYZ, false>;
}
- CU_LAUNCH_ERR("pme_solve_kernel");
- pme_gpu_stop_timing(pmeGpu, gtPME_SOLVE);
+
+ pme_gpu_start_timing(pmeGpu, timingId);
+ auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+ launchGpuKernel(kernelPtr, config, timingEvent, "PME solve", kernelArgs);
+ pme_gpu_stop_timing(pmeGpu, timingId);
if (computeEnergyAndVirial)
{
bool spreadCharges)
{
GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)");
- cudaStream_t stream = pmeGpu->archSpecific->pmeStream;
const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");
//(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define)
GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size");
- const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
- auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
- dim3 dimBlock(order, order, atomsPerBlock);
+ const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = config.blockSize[1] = order;
+ config.blockSize[2] = atomsPerBlock;
+ config.gridSize[0] = dimGrid.x;
+ config.gridSize[1] = dimGrid.y;
+ config.stream = pmeGpu->archSpecific->pmeStream;
+
+ if (order != 4)
+ {
+ GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 is not implemented"));
+ }
// These should later check for PME decomposition
- const bool wrapX = true;
- const bool wrapY = true;
+ constexpr bool wrapX = true;
+ constexpr bool wrapY = true;
GMX_UNUSED_VALUE(wrapX);
GMX_UNUSED_VALUE(wrapY);
- switch (order)
+
+ int timingId;
+ void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr;
+ if (computeSplines)
{
- case 4:
+ if (spreadCharges)
{
- // TODO: cleaner unroll with some template trick?
- if (computeSplines)
- {
- if (spreadCharges)
- {
- pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
- pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
- CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
- pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD);
- }
- else
- {
- pme_gpu_start_timing(pmeGpu, gtPME_SPLINE);
- pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
- CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
- pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE);
- }
- }
- else
- {
- pme_gpu_start_timing(pmeGpu, gtPME_SPREAD);
- pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr);
- CU_LAUNCH_ERR("pme_spline_and_spread_kernel");
- pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD);
- }
+ timingId = gtPME_SPLINEANDSPREAD;
+ kernelPtr = pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY>;
+ }
+ else
+ {
+ timingId = gtPME_SPLINE;
+ kernelPtr = pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY>;
}
- break;
-
- default:
- GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not tested!"));
}
+ else
+ {
+ timingId = gtPME_SPREAD;
+ kernelPtr = pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY>;
+ }
+
+ pme_gpu_start_timing(pmeGpu, timingId);
+ auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+ launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs);
+ pme_gpu_stop_timing(pmeGpu, timingId);
const bool copyBackGrid = spreadCharges && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu));
if (copyBackGrid)
}
}
+CommandEvent *pme_gpu_fetch_timing_event(const PmeGpu *pmeGpu, size_t PMEStageId)
+{
+ CommandEvent *timingEvent = nullptr;
+ if (pme_gpu_timings_enabled(pmeGpu))
+ {
+ GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index");
+ timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent();
+ }
+ return timingEvent;
+}
+
void pme_gpu_stop_timing(const PmeGpu *pmeGpu, size_t PMEStageId)
{
if (pme_gpu_timings_enabled(pmeGpu))
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*/
void pme_gpu_start_timing(const PmeGpu *pmeGpu, size_t PMEStageId);
+/*! \libinternal \brief
+ * Returns the raw timing event from the corresponding GpuRegionTimer (if timings are enabled).
+ * In CUDA, the result can be a nullptr stub, depending on the GpuRegionTimer implementation.
+ *
+ * \param[in] pmeGpu The PME GPU data structure.
+ * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h
+ */
+CommandEvent *pme_gpu_fetch_timing_event(const PmeGpu *pmeGpu, size_t PMEStageId);
+
/*! \libinternal \brief
* Stops timing the certain PME GPU stage during a single computation (if timings are enabled).
*
#include <nvml.h>
#endif /* HAVE_NVML */
+#include <array>
#include <string>
+#include "gromacs/gpu_utils/gputraits.cuh"
#include "gromacs/math/vec.h"
#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/stringutil.h"
} \
} while (0)
-/*! Check for any previously occurred uncaught CUDA error
- -- aimed at use after kernel calls. */
-#define CU_LAUNCH_ERR(msg) \
- do { \
- cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \
- if (_CU_LAUNCH_ERR_status != cudaSuccess) { \
- gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_LAUNCH_ERR_status)); \
- } \
- } while (0)
-
-/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error
- -- aimed at use after kernel calls. */
-#define CU_LAUNCH_ERR_SYNC(msg) \
- do { \
- cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \
- if (_CU_SYNC_LAUNCH_ERR_status != cudaSuccess) { \
- gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_SYNC_LAUNCH_ERR_status)); \
- } \
- } while (0)
-
#else /* CHECK_CUDA_ERRORS */
#define CU_RET_ERR(status, msg) do { } while (0)
#define CU_CHECK_PREV_ERR() do { } while (0)
-#define CU_LAUNCH_ERR(msg) do { } while (0)
-#define CU_LAUNCH_ERR_SYNC(msg) do { } while (0)
#define HANDLE_NVML_RET_ERR(status, msg) do { } while (0)
#endif /* CHECK_CUDA_ERRORS */
return true;
}
+/* Kernel launch helpers */
+
+/*! \brief
+ * A function for setting up a single CUDA kernel argument.
+ * This is the tail of the compile-time recursive function below.
+ * It has to be seen by the compiler first.
+ *
+ * \tparam totalArgsCount Number of the kernel arguments
+ * \tparam KernelPtr Kernel function handle type
+ * \param[in] argIndex Index of the current argument
+ */
+template <size_t totalArgsCount, typename KernelPtr>
+void prepareGpuKernelArgument(KernelPtr /*kernel*/,
+ std::array<void *, totalArgsCount> */* kernelArgsPtr */,
+ size_t gmx_used_in_debug argIndex)
+{
+ GMX_ASSERT(argIndex == totalArgsCount, "Tail expansion");
+}
+
+/*! \brief
+ * Compile-time recursive function for setting up a single CUDA kernel argument.
+ * This function copies a kernel argument pointer \p argPtr into \p kernelArgsPtr,
+ * and calls itself on the next argument, eventually calling the tail function above.
+ *
+ * \tparam CurrentArg Type of the current argument
+ * \tparam RemainingArgs Types of remaining arguments after the current one
+ * \tparam totalArgsCount Number of the kernel arguments
+ * \tparam KernelPtr Kernel function handle type
+ * \param[in] kernel Kernel function handle
+ * \param[in,out] kernelArgsPtr Pointer to the argument array to be filled in
+ * \param[in] argIndex Index of the current argument
+ * \param[in] argPtr Pointer to the current argument
+ * \param[in] otherArgsPtrs Pack of pointers to arguments remaining to process after the current one
+ */
+template <typename CurrentArg, typename ... RemainingArgs, size_t totalArgsCount, typename KernelPtr>
+void prepareGpuKernelArgument(KernelPtr kernel,
+ std::array<void *, totalArgsCount> *kernelArgsPtr,
+ size_t argIndex,
+ const CurrentArg *argPtr,
+ const RemainingArgs *... otherArgsPtrs)
+{
+ (*kernelArgsPtr)[argIndex] = (void *)argPtr;
+ prepareGpuKernelArgument(kernel, kernelArgsPtr, argIndex + 1, otherArgsPtrs ...);
+}
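+
+/* For illustration (a sketch; k, x and n are hypothetical): for a kernel
+ * void k(float, int), the call prepareGpuKernelArgument(k, &kernelArgs, 0, &x, &n)
+ * expands into
+ *   kernelArgs[0] = (void *)&x;  // CurrentArg = float
+ *   kernelArgs[1] = (void *)&n;  // CurrentArg = int
+ * followed by the tail call prepareGpuKernelArgument(k, &kernelArgs, 2),
+ * which only asserts that all arguments have been consumed.
+ */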
+
+/*! \brief
+ * A wrapper function for setting up all the CUDA kernel arguments.
+ * Calls the recursive functions above.
+ *
+ * \tparam Args Types of all the kernel arguments
+ * \param[in] kernel Kernel function handle
+ * \param[in] argsPtrs Pointers to all the kernel arguments
+ * \returns A prepared parameter pack to be used with launchGpuKernel() as the last argument.
+ */
+template <typename ... Args>
+std::array<void *, sizeof ... (Args)> prepareGpuKernelArguments(void (*kernel)(Args...),
+ const KernelLaunchConfig & /*config */,
+ const Args *... argsPtrs)
+{
+ std::array<void *, sizeof ... (Args)> kernelArgs;
+ prepareGpuKernelArgument(kernel, &kernelArgs, 0, argsPtrs ...);
+ return kernelArgs;
+}
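+
+/* Note: the returned array holds raw pointers into the caller's argument objects,
+ * so those objects must remain alive until the subsequent launchGpuKernel() call.
+ */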
+
+/*! \brief Launches the CUDA kernel and handles the errors.
+ *
+ * \tparam Args Types of all the kernel arguments
+ * \param[in] kernel Kernel function handle
+ * \param[in] config Kernel configuration for launching
+ * \param[in] kernelName Human-readable kernel description, for error handling only
+ * \param[in] kernelArgs Array of the pointers to the kernel arguments, prepared by prepareGpuKernelArguments()
+ * \throws gmx::InternalError on kernel launch failure
+ */
+template <typename... Args>
+void launchGpuKernel(void (*kernel)(Args...),
+ const KernelLaunchConfig &config,
+ CommandEvent */*timingEvent */,
+ const char *kernelName,
+ const std::array<void *, sizeof ... (Args)> &kernelArgs)
+{
+ dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]);
+ dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]);
+ cudaLaunchKernel((void *)kernel, gridSize, blockSize, const_cast<void **>(kernelArgs.data()), config.sharedMemorySize, config.stream);
+
+ cudaError_t status = cudaGetLastError();
+ if (cudaSuccess != status)
+ {
+ const std::string errorMessage = "GPU kernel (" + std::string(kernelName) +
+ ") failed to launch: " + std::string(cudaGetErrorString(status));
+ GMX_THROW(gmx::InternalError(errorMessage));
+ }
+}
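+
+/* Typical usage (a sketch; the kernel and its arguments are hypothetical):
+ *   KernelLaunchConfig config;
+ *   config.gridSize[0]  = blockCount;
+ *   config.blockSize[0] = threadsPerBlock;
+ *   config.stream       = stream;
+ *   const auto kernelArgs = prepareGpuKernelArguments(k_example, config, &param);
+ *   launchGpuKernel(k_example, config, timingEvent, "example kernel", kernelArgs);
+ */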
+
#endif
}
/* try to execute a dummy kernel */
- k_dummy_test<<< 1, 512>>> ();
+ KernelLaunchConfig config;
+ config.blockSize[0] = 512;
+ const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config);
+ launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments);
if (cudaThreadSynchronize() != cudaSuccess)
{
return -1;
//! \brief Context used explicitly in OpenCL, does nothing in CUDA
using Context = void *;
+/*! \internal \brief
+ * GPU kernel scheduling description. This is the same in OpenCL/CUDA.
+ * Provides reasonable defaults; one typically only needs to set the GPU stream
+ * and any non-unit work sizes.
+ */
+struct KernelLaunchConfig
+{
+ size_t gridSize[3] = {1, 1, 1}; //!< Block counts
+ size_t blockSize[3] = {1, 1, 1}; //!< Per-block thread counts
+ size_t sharedMemorySize = 0; //!< Shared memory size in bytes
+ CommandStream stream = nullptr; //!< Stream to launch kernel in
+};
+
#endif
//! \brief Context used explicitly in OpenCL
using Context = cl_context;
+/*! \internal \brief
+ * GPU kernel scheduling description. This is the same in OpenCL/CUDA.
+ * Provides reasonable defaults; one typically only needs to set the GPU stream
+ * and any non-unit work sizes.
+ */
+struct KernelLaunchConfig
+{
+ size_t gridSize[3] = {1, 1, 1}; //!< Work group (CUDA block) counts
+ size_t blockSize[3] = {1, 1, 1}; //!< Per work group (CUDA block) thread counts
+ size_t sharedMemorySize = 0; //!< Shared memory size in bytes
+ CommandStream stream = nullptr; //!< Stream to launch kernel in
+};
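+
+/* Note: in OpenCL, blockSize maps to the local work size and the global work
+ * size is derived as gridSize[i] * blockSize[i] per dimension (see launchGpuKernel()).
+ */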
+
#endif
#include <string>
#include "gromacs/gpu_utils/gmxopencl.h"
+#include "gromacs/gpu_utils/gputraits_ocl.h"
+#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/gmxassert.h"
enum class GpuApiCallBehavior;
return false;
}
+/* Kernel launch helpers */
+
+/*! \brief
+ * A function for setting up a single OpenCL kernel argument.
+ * This is the tail of the compile-time recursive function below.
+ * It has to be seen by the compiler first.
+ * As the NB kernels might use dynamic local memory as their last argument,
+ * this function also sets that up, using the sharedMemorySize from \p config.
+ *
+ * \param[in] kernel Kernel function handle
+ * \param[in] config Kernel configuration for launching
+ * \param[in] argIndex Index of the current argument
+ */
+inline void prepareGpuKernelArgument(cl_kernel kernel,
+ const KernelLaunchConfig &config,
+ size_t argIndex)
+{
+ if (config.sharedMemorySize > 0)
+ {
+ cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, config.sharedMemorySize, nullptr);
+ GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str());
+ }
+}
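+
+/* For illustration (a sketch): with config.sharedMemorySize == 2048, this tail
+ * call issues clSetKernelArg(kernel, argIndex, 2048, nullptr), which the kernel
+ * receives as its trailing __local buffer argument.
+ */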
+
+/*! \brief
+ * Compile-time recursive function for setting up a single OpenCL kernel argument.
+ * This function uses one kernel argument pointer \p argPtr to call clSetKernelArg(),
+ * and calls itself on the next argument, eventually calling the tail function above.
+ *
+ * \tparam CurrentArg Type of the current argument
+ * \tparam RemainingArgs Types of remaining arguments after the current one
+ * \param[in] kernel Kernel function handle
+ * \param[in] config Kernel configuration for launching
+ * \param[in] argIndex Index of the current argument
+ * \param[in] argPtr Pointer to the current argument
+ * \param[in] otherArgsPtrs Pack of pointers to arguments remaining to process after the current one
+ */
+template <typename CurrentArg, typename ... RemainingArgs>
+void prepareGpuKernelArgument(cl_kernel kernel,
+ const KernelLaunchConfig &config,
+ size_t argIndex,
+ const CurrentArg *argPtr,
+ const RemainingArgs *... otherArgsPtrs)
+{
+ cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, sizeof(CurrentArg), argPtr);
+ GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str());
+
+ prepareGpuKernelArgument(kernel, config, argIndex + 1, otherArgsPtrs ...);
+}
+
+/*! \brief
+ * A wrapper function for setting up all the OpenCL kernel arguments.
+ * Calls the recursive functions above.
+ *
+ * \tparam Args Types of all the kernel arguments
+ * \param[in] kernel Kernel function handle
+ * \param[in] config Kernel configuration for launching
+ * \param[in] argsPtrs Pointers to all the kernel arguments
+ * \returns A handle for the prepared parameter pack to be used with launchGpuKernel() as the last argument
+ * - currently always nullptr for OpenCL, as it manages kernel/arguments association by itself.
+ */
+template <typename ... Args>
+void *prepareGpuKernelArguments(cl_kernel kernel,
+ const KernelLaunchConfig &config,
+ const Args *... argsPtrs)
+{
+ prepareGpuKernelArgument(kernel, config, 0, argsPtrs ...);
+ return nullptr;
+}
+
+/*! \brief Launches the OpenCL kernel and handles the errors.
+ *
+ * \param[in] kernel Kernel function handle
+ * \param[in] config Kernel configuration for launching
+ * \param[in] timingEvent Timing event, fetched from GpuRegionTimer
+ * \param[in] kernelName Human-readable kernel description, for error handling only
+ * \throws gmx::InternalError on kernel launch failure
+ */
+inline void launchGpuKernel(cl_kernel kernel,
+ const KernelLaunchConfig &config,
+ CommandEvent *timingEvent,
+ const char *kernelName,
+ const void * /*kernelArgs*/)
+{
+ const int workDimensions = 3;
+ const size_t *globalWorkOffset = nullptr;
+ const size_t waitListSize = 0;
+ const cl_event *waitList = nullptr;
+ size_t globalWorkSize[3];
+ for (int i = 0; i < workDimensions; i++)
+ {
+ globalWorkSize[i] = config.gridSize[i] * config.blockSize[i];
+ }
+ cl_int clError = clEnqueueNDRangeKernel(config.stream, kernel, workDimensions, globalWorkOffset,
+ globalWorkSize, config.blockSize, waitListSize, waitList, timingEvent);
+ if (CL_SUCCESS != clError)
+ {
+ const std::string errorMessage = "GPU kernel (" + std::string(kernelName) +
+ ") failed to launch: " + ocl_get_error_string(clError);
+ GMX_THROW(gmx::InternalError(errorMessage));
+ }
+}
+
#endif
cudaError_t stat;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
/* CUDA kernel launch-related stuff */
- int shmem, nblock;
+ int nblock;
dim3 dim_block, dim_grid;
nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
num_threads_z = 2;
}
nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info);
- dim_block = dim3(c_clSize, c_clSize, num_threads_z);
- dim_grid = dim3(nblock, 1, 1);
- shmem = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp);
+
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_clSize;
+ config.blockSize[1] = c_clSize;
+ config.blockSize[2] = num_threads_z;
+ config.gridSize[0] = nblock;
+ config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp);
+ config.stream = stream;
if (debug)
{
- fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %ux%ux%u\n\t"
- "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n"
- "\tShMem: %d\n",
- dim_block.x, dim_block.y, dim_block.z,
- dim_grid.x, dim_grid.y, plist->nsci*c_numClPerSupercl,
+ fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %zux%zux%zu\n\t"
+ "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %zu\n",
+ config.blockSize[0], config.blockSize[1], config.blockSize[2],
+ config.gridSize[0], config.gridSize[1], plist->nsci*c_numClPerSupercl,
c_numClPerSupercl, plist->na_c,
- shmem);
+ config.sharedMemorySize);
}
- void* kernel_args[4];
- kernel_args[0] = adat;
- kernel_args[1] = nbp;
- kernel_args[2] = plist;
- kernel_args[3] = &bCalcFshift;
-
- cudaLaunchKernel((void *)nb_kernel, dim_grid, dim_block, kernel_args, shmem, stream);
- CU_LAUNCH_ERR("k_calc_nb");
+ auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr;
+ const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, adat, nbp, plist, &bCalcFshift);
+ launchGpuKernel(nb_kernel, config, timingEvent, "k_calc_nb", kernelArgs);
if (bDoTime)
{
* and j-cluster concurrency, in x, y, and z, respectively.
* - The 1D block-grid contains as many blocks as super-clusters.
*/
- int num_threads_z = c_cudaPruneKernelJ4Concurrency;
- int nblock = calc_nb_kernel_nblock(numSciInPart, nb->dev_info);
- dim3 dim_block = dim3(c_clSize, c_clSize, num_threads_z);
- dim3 dim_grid = dim3(nblock, 1, 1);
- int shmem = calc_shmem_required_prune(num_threads_z);
+ int num_threads_z = c_cudaPruneKernelJ4Concurrency;
+ int nblock = calc_nb_kernel_nblock(numSciInPart, nb->dev_info);
+ KernelLaunchConfig config;
+ config.blockSize[0] = c_clSize;
+ config.blockSize[1] = c_clSize;
+ config.blockSize[2] = num_threads_z;
+ config.gridSize[0] = nblock;
+ config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
+ config.stream = stream;
if (debug)
{
- fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %ux%ux%u\n\t"
- "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n"
- "\tShMem: %d\n",
- dim_block.x, dim_block.y, dim_block.z,
- dim_grid.x, dim_grid.y, numSciInPart*c_numClPerSupercl,
+ fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %zux%zux%zu\n\t"
+ "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
+ "\tShMem: %zu\n",
+ config.blockSize[0], config.blockSize[1], config.blockSize[2],
+ config.gridSize[0], config.gridSize[1], numSciInPart*c_numClPerSupercl,
c_numClPerSupercl, plist->na_c,
- shmem);
+ config.sharedMemorySize);
}
- void* kernel_args[5];
- kernel_args[0] = adat;
- kernel_args[1] = nbp;
- kernel_args[2] = plist;
- kernel_args[3] = &numParts;
- kernel_args[4] = &part;
-
- if (plist->haveFreshList)
- {
- cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<true>, dim_grid, dim_block, kernel_args, shmem, stream);
- }
- else
- {
- cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<false>, dim_grid, dim_block, kernel_args, shmem, stream);
- }
- CU_LAUNCH_ERR("k_pruneonly");
+ auto *timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr;
+ constexpr char kernelName[] = "k_pruneonly";
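+ // haveFreshList selects the first-pass prune kernel variant; the rolling-prune
+ // variant is launched otherwise (cf. the tracking TODO below).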
+ const auto &kernel = plist->haveFreshList ? nbnxn_kernel_prune_cuda<true> : nbnxn_kernel_prune_cuda<false>;
+ const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part);
+ launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
/* TODO: consider a more elegant way to track which kernel has been called
(combined or separate 1st pass prune, rolling prune). */
/*! \brief Validates the input global work size parameter.
*/
-static inline void validate_global_work_size(size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo)
+static inline void validate_global_work_size(const KernelLaunchConfig &config, int work_dim, const gmx_device_info_t *dinfo)
{
cl_uint device_size_t_size_bits;
cl_uint host_size_t_size_bits;
assert(dinfo);
+ size_t global_work_size[3];
+ GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet");
+ for (int i = 0; i < work_dim; i++)
+ {
+ global_work_size[i] = config.blockSize[i] * config.gridSize[i];
+ }
+
/* Each component of a global_work_size must not exceed the range given by the
sizeof(device size_t) for the device on which the kernel execution will
be enqueued. See:
int flags,
int iloc)
{
- cl_int cl_error;
int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
/* OpenCL kernel launch-related stuff */
- int shmem;
- size_t local_work_size[3], global_work_size[3];
- cl_kernel nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
+ cl_kernel nb_kernel = NULL; /* fn pointer to the nonbonded kernel */
cl_atomdata_t *adat = nb->atdat;
cl_nbparam_t *nbp = nb->nbparam;
bool bCalcEner = flags & GMX_FORCE_ENERGY;
int bCalcFshift = flags & GMX_FORCE_VIRIAL;
bool bDoTime = nb->bDoTime;
- cl_uint arg_no;
cl_nbparam_params_t nbparams_params;
{
if (iloc == eintLocal)
{
- cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
+ cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done));
assert(CL_SUCCESS == cl_error);
/* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
(plist->haveFreshList && !nb->timers->didPrune[iloc]));
/* kernel launch config */
- local_work_size[0] = c_clSize;
- local_work_size[1] = c_clSize;
- local_work_size[2] = 1;
-
- global_work_size[0] = plist->nsci * local_work_size[0];
- global_work_size[1] = 1 * local_work_size[1];
- global_work_size[2] = 1 * local_work_size[2];
- validate_global_work_size(global_work_size, 3, nb->dev_info);
+ KernelLaunchConfig config;
+ config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
+ config.stream = stream;
+ config.blockSize[0] = c_clSize;
+ config.blockSize[1] = c_clSize;
+ config.gridSize[0] = plist->nsci;
- shmem = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
+ validate_global_work_size(config, 3, nb->dev_info);
if (debug)
{
fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
"Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n",
- (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
- (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
+ (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]),
+ (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl,
c_numClPerSupercl, plist->na_c);
}
fillin_ocl_structures(nbp, &nbparams_params);
- arg_no = 0;
- cl_error = CL_SUCCESS;
- if (!useLjCombRule(nb->nbparam->vdwtype))
- {
- cl_error = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes));
- }
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift));
+ auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr;
+ constexpr char kernelName[] = "k_calc_nb";
if (useLjCombRule(nb->nbparam->vdwtype))
{
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->lj_comb));
+ const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config,
+ &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
+ &adat->lj_comb,
+ &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
+ &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift);
+
+ launchGpuKernel(nb_kernel, config, timingEvent, kernelName, kernelArgs);
}
else
{
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types));
+ const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config,
+ &adat->ntypes,
+ &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
+ &adat->atom_types,
+ &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
+ &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift);
+ launchGpuKernel(nb_kernel, config, timingEvent, kernelName, kernelArgs);
}
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl));
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift);
- cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL);
-
- assert(cl_error == CL_SUCCESS);
-
- if (cl_error)
- {
- printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str());
- }
- cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr);
- assert(cl_error == CL_SUCCESS);
if (bDoTime)
{
int iloc,
int numParts)
{
- cl_int cl_error;
-
cl_atomdata_t *adat = nb->atdat;
cl_nbparam_t *nbp = nb->nbparam;
cl_plist_t *plist = nb->plist[iloc];
cl_kernel pruneKernel = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
/* kernel launch config */
- size_t local_work_size[3], global_work_size[3];
- local_work_size[0] = c_clSize;
- local_work_size[1] = c_clSize;
- local_work_size[2] = num_threads_z;
+ KernelLaunchConfig config;
+ config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
+ config.stream = stream;
+ config.blockSize[0] = c_clSize;
+ config.blockSize[1] = c_clSize;
+ config.blockSize[2] = num_threads_z;
+ config.gridSize[0] = numSciInPart;
- global_work_size[0] = numSciInPart * local_work_size[0];
- global_work_size[1] = 1 * local_work_size[1];
- global_work_size[2] = 1 * local_work_size[2];
-
- validate_global_work_size(global_work_size, 3, nb->dev_info);
-
- int shmem = calc_shmem_required_prune(num_threads_z);
+ validate_global_work_size(config, 3, nb->dev_info);
if (debug)
{
fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %dx%dx%d\n\t"
"\tGlobal work size: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n"
- "\tShMem: %d\n",
- (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]),
- (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl,
- c_numClPerSupercl, plist->na_c, shmem);
+ "\tShMem: %zu\n",
+ (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]),
+ (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl,
+ c_numClPerSupercl, plist->na_c, config.sharedMemorySize);
}
cl_nbparam_params_t nbparams_params;
fillin_ocl_structures(nbp, &nbparams_params);
- cl_uint arg_no = 0;
- cl_error = CL_SUCCESS;
-
- cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(nbparams_params), &(nbparams_params));
- cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->xq));
- cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec));
- cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->sci));
- cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->cj4));
- cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->imask));
- cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(numParts));
- cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(part));
- cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr);
- assert(cl_error == CL_SUCCESS);
-
- cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3,
- nullptr, global_work_size, local_work_size,
- 0, nullptr, bDoTime ? timer->fetchNextEvent() : nullptr);
- GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
+ auto *timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr;
+ constexpr char kernelName[] = "k_pruneonly";
+ const auto kernelArgs = prepareGpuKernelArguments(pruneKernel, config,
+ &nbparams_params, &adat->xq, &adat->shift_vec,
+ &plist->sci, &plist->cj4, &plist->imask, &numParts, &part);
+ launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs);
if (plist->haveFreshList)
{