gmx_add_libgromacs_sources(
# OpenCL-specific sources
pme-gpu-3dfft-ocl.cpp
+ pme-gpu-program-impl-ocl.cpp
# GPU-specific sources
pme-gpu.cpp
pme-gpu-internal.cpp
pme-gpu-timings.cpp
- # Files that implement stubs
- pme-gpu-program-impl.cpp
)
else()
gmx_add_libgromacs_sources(
const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
const int atomDataSize = PME_SPREADGATHER_THREADS_PER_ATOM; /* Number of data components and threads for a single atom */
const int blockSize = atomsPerBlock * atomDataSize;
+ const int atomsPerWarp = PME_SPREADGATHER_ATOMS_PER_WARP;
const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
const int pny = kernelParams.grid.realGridSizePadded[YY];
const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
- const int atomWarpIndex = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
- const int warpIndex = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;
+ const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+ const int warpIndex = atomIndexLocal / atomsPerWarp;
- const int splineIndexBase = getSplineParamIndexBase<order>(warpIndex, atomWarpIndex);
- const int splineIndexY = getSplineParamIndex<order>(splineIndexBase, YY, ithy);
+ const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+ const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
const float2 tdy = sm_splineParams[splineIndexY];
- const int splineIndexZ = getSplineParamIndex<order>(splineIndexBase, ZZ, ithz);
+ const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
const float2 tdz = sm_splineParams[splineIndexZ];
const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
assert(gridIndexGlobal >= 0);
const float gridValue = gm_grid[gridIndexGlobal];
assert(isfinite(gridValue));
- const int splineIndexX = getSplineParamIndex<order>(splineIndexBase, XX, ithx);
+ const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
const float2 tdx = sm_splineParams[splineIndexX];
const float fxy1 = tdz.x * gridValue;
const float fz1 = tdz.y * gridValue;
#include "config.h"
#if GMX_GPU == GMX_GPU_CUDA
-#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
-#else
-#define warp_size 32 // FIXME remove this and rework macros
-#define PME_SPREADGATHER_ATOMS_PER_WARP 2
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh" // for warp_size
#endif
/* General settings for PME GPU behaviour */
/*! \brief
* The number of GPU threads used for computing spread/gather contributions of a single atom as function of the PME order.
* The assumption is currently that any thread processes only a single atom's contributions.
+ * TODO: this assumption leads to minimum execution width of 16. See Redmine #2516
*/
#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
/*! \brief
* Atom data alignment (in terms of number of atoms).
- * The value is (16 * PME_SPREADGATHER_ATOMS_PER_WARP).
+ * This is the least common multiple of number of atoms processed by
+ * a single block/workgroup of the spread and gather kernels.
* If the GPU atom data buffers are padded (c_usePadding == true),
- * Then the numbers of atoms which would fit in the padded GPU buffers has to be divisible by this.
- * The literal number (16) expresses maximum spread/gather block width in warps.
- * Accordingly, spread and gather block widths in warps should be divisors of this
- * (e.g. in the pme-spread.cu: constexpr int c_spreadMaxThreadsPerBlock = 8 * warp_size;).
- * There are debug asserts for this divisibility.
+ * Then the numbers of atoms which would fit in the padded GPU buffers have to be divisible by this.
+ * There are debug asserts for this divisibility in pme_gpu_spread() and pme_gpu_gather().
*/
#define PME_ATOM_DATA_ALIGNMENT 32
/*
* The execution widths for PME GPU kernels, used both on host and device for correct scheduling.
- * TODO: adjust those for OpenCL.
+ * TODO: those were tuned for CUDA with assumption of warp size 32; specialize those for OpenCL
+ * (Redmine #2528).
+ * As noted below, these are very approximate maximum sizes; in run time we might have to use
+ * smaller block/workgroup sizes, depending on device capabilities.
*/
+//! Spreading max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime in most cases
+constexpr int c_spreadMaxWarpsPerBlock = 8;
+
+//! Solving kernel max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime
+//! (560Ti (CC2.1), 660Ti (CC3.0) and 750 (CC5.0)))
+constexpr int c_solveMaxWarpsPerBlock = 8;
+
+//! Gathering max block width in warps - picked empirically among 2, 4, 8, 16 for max. occupancy and min. runtime
+constexpr int c_gatherMaxWarpsPerBlock = 4;
+
+
#if GMX_GPU == GMX_GPU_CUDA
+/* All the guys below are dependent on warp_size and should ideally be removed from the host-side code,
+ * as we have to do that for OpenCL already.
+ * They also express maximum desired block/workgroup sizes, while both with CUDA and OpenCL we have to treat
+ * the device runtime limitations gracefully as well.
+ */
+
/*! \brief
* The number of atoms processed by a single warp in spread/gather.
- * This macro depends on the templated order parameter (2 atoms per warp for order 4).
+ * This macro depends on the templated order parameter (2 atoms per warp for order 4 and warp_size of 32).
* It is mostly used for spline data layout tweaked for coalesced access.
*/
#define PME_SPREADGATHER_ATOMS_PER_WARP (warp_size / PME_SPREADGATHER_THREADS_PER_ATOM)
-//! Spreading max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime in most cases
-constexpr int c_spreadMaxWarpsPerBlock = 8;
-/* TODO: it has been observed that the kernel can be faster with smaller block sizes (2 or 4 warps)
- * only on some GPUs (660Ti) with large enough grid (>= 48^3) due to load/store units being overloaded
- * (ldst_fu_utilization metric maxed out in nvprof). Runtime block size choice might be nice to have.
- * This has been tried on architectures up to Maxwell (GTX 750) and it would be good to revisit this.
- */
//! Spreading max block size in threads
constexpr int c_spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * warp_size;
-//! Solving kernel max block width in warps picked among powers of 2 (2, 4, 8, 16) for max. occupancy and min. runtime
-//! (560Ti (CC2.1), 660Ti (CC3.0) and 750 (CC5.0)))
-constexpr int c_solveMaxWarpsPerBlock = 8;
//! Solving kernel max block size in threads
constexpr int c_solveMaxThreadsPerBlock = (c_solveMaxWarpsPerBlock * warp_size);
-//! Gathering max block width in warps - picked empirically among 2, 4, 8, 16 for max. occupancy and min. runtime
-constexpr int c_gatherMaxWarpsPerBlock = 4;
//! Gathering max block size in threads
constexpr int c_gatherMaxThreadsPerBlock = c_gatherMaxWarpsPerBlock * warp_size;
//! Gathering min blocks per CUDA multiprocessor - for CC2.x, we just take the CUDA limit of 8 to avoid the warning
int pme_gpu_get_atoms_per_warp(const PmeGpu *pmeGpu)
{
-#if GMX_GPU == GMX_GPU_CUDA
const int order = pmeGpu->common->pme_order;
GMX_ASSERT(order > 0, "Invalid PME order");
- return PME_SPREADGATHER_ATOMS_PER_WARP;
-#else
- GMX_THROW(gmx::NotImplementedError("Atom alignment per warp has to be deduced dynamically for OpenCL"));
- GMX_UNUSED_VALUE(pmeGpu);
-#endif
+ return pmeGpu->programHandle_->impl_->warpSize / PME_SPREADGATHER_THREADS_PER_ATOM;
}
void pme_gpu_synchronize(const PmeGpu *pmeGpu)
pmeGpu->archSpecific->fftSetup.resize(0);
}
-int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int warpIndex, int atomWarpIndex)
+int getSplineParamFullIndex(int order, int splineIndex, int dimIndex, int atomIndex, int atomsPerWarp)
{
if (order != 4)
{
}
constexpr int fixedOrder = 4;
GMX_UNUSED_VALUE(fixedOrder);
- const int indexBase = getSplineParamIndexBase<fixedOrder>(warpIndex, atomWarpIndex);
- return getSplineParamIndex<fixedOrder>(indexBase, dimIndex, splineIndex);
+
+ const int atomWarpIndex = atomIndex % atomsPerWarp;
+ const int warpIndex = atomIndex / atomsPerWarp;
+ int indexBase, result;
+ switch (atomsPerWarp)
+ {
+ case 2:
+ indexBase = getSplineParamIndexBase<fixedOrder, 2>(warpIndex, atomWarpIndex);
+ result = getSplineParamIndex<fixedOrder, 2>(indexBase, dimIndex, splineIndex);
+ break;
+
+ default:
+ GMX_THROW(gmx::NotImplementedError(gmx::formatString("Test function call not unrolled for atomsPerWarp = %d in getSplineParamFullIndex", atomsPerWarp)));
+ }
+ return result;
}
gmx::ArrayRef<gmx::RVec> pme_gpu_get_forces(PmeGpu *pmeGpu)
for (auto atomIndex = 0; atomIndex < atomCount; atomIndex++)
{
- auto atomWarpIndex = atomIndex % atomsPerWarp;
- auto warpIndex = atomIndex / atomsPerWarp;
for (auto orderIndex = 0; orderIndex < pmeOrder; orderIndex++)
{
- const auto gpuValueIndex = getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, warpIndex, atomWarpIndex);
+ const auto gpuValueIndex = getSplineParamFullIndex(pmeOrder, orderIndex, dimIndex, atomIndex, atomsPerWarp);
const auto cpuValueIndex = atomIndex * pmeOrder + orderIndex;
GMX_ASSERT(cpuValueIndex < atomCount * pmeOrder, "Atom spline data index out of bounds (while transforming GPU data layout for host)");
switch (transform)
const auto *kernelParamsPtr = pmeGpu->kernelParams.get();
GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");
-#if GMX_GPU == GMX_GPU_OPENCL
- const int c_spreadMaxThreadsPerBlock = 1; //FIXME make a runtime decision
-#endif
-
+ const int maxThreadsPerBlock = c_spreadMaxWarpsPerBlock * pmeGpu->programHandle_->impl_->warpSize;
+ // TODO take device limitations (CL_DEVICE_MAX_WORK_GROUP_SIZE) into account for all 3 kernels
const int order = pmeGpu->common->pme_order;
- const int atomsPerBlock = c_spreadMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
+ const int atomsPerBlock = maxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
// TODO: pick smaller block size in runtime if needed
// (e.g. on 660 Ti where 50% occupancy is ~25% faster than 100% occupancy with RNAse (~17.8k atoms))
// If doing so, change atomsPerBlock in the kernels as well.
GMX_ASSERT(false, "Implement grid ordering here and below for the kernel launch");
}
-#if GMX_GPU == GMX_GPU_OPENCL
- const int c_solveMaxThreadsPerBlock = 1; //FIXME make a runtime decision
-#endif
-
- const int maxBlockSize = c_solveMaxThreadsPerBlock;
- const int gridLineSize = pmeGpu->kernelParams->grid.complexGridSize[minorDim];
- const int gridLinesPerBlock = std::max(maxBlockSize / gridLineSize, 1);
- const int blocksPerGridLine = (gridLineSize + maxBlockSize - 1) / maxBlockSize;
- const int cellsPerBlock = gridLineSize * gridLinesPerBlock;
- const int blockSize = (cellsPerBlock + warp_size - 1) / warp_size * warp_size;
+ const int maxThreadsPerBlock = c_solveMaxWarpsPerBlock * pmeGpu->programHandle_->impl_->warpSize;
+ const int maxBlockSize = maxThreadsPerBlock;
+ const int gridLineSize = pmeGpu->kernelParams->grid.complexGridSize[minorDim];
+ const int gridLinesPerBlock = std::max(maxBlockSize / gridLineSize, 1);
+ const int blocksPerGridLine = (gridLineSize + maxBlockSize - 1) / maxBlockSize;
+ const int cellsPerBlock = gridLineSize * gridLinesPerBlock;
+ const size_t warpSize = pmeGpu->programHandle_->impl_->warpSize;
+ const int blockSize = (cellsPerBlock + warpSize - 1) / warpSize * warpSize;
KernelLaunchConfig config;
pme_gpu_copy_input_gather_atom_data(pmeGpu);
}
-#if GMX_GPU == GMX_GPU_OPENCL
- const int c_gatherMaxThreadsPerBlock = 1; //FIXME make a runtime decision
-#endif
- const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM);
+ const int maxThreadsPerBlock = c_gatherMaxWarpsPerBlock * pmeGpu->programHandle_->impl_->warpSize;
+ const int atomsPerBlock = maxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM;
GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size");
const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock;
* \param[in] order PME order
* \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
* \param[in] dimIndex Dimension index (from 0 to 2)
- * \param[in] warpIndex Warp index wrt the block.
- * \param[in] atomWarpIndex Atom index wrt the warp (0 or 1).
+ * \param[in] atomIndex Atom index wrt the block.
+ * \param[in] atomsPerWarp Number of atoms processed by a warp.
*
* \returns Index into theta or dtheta array using GPU layout.
*/
int getSplineParamFullIndex(int order,
int splineIndex,
int dimIndex,
- int warpIndex,
- int atomWarpIndex);
+ int atomIndex,
+ int atomsPerWarp);
/*! \libinternal \brief
* Get the normal/padded grid dimensions of the real-space PME grid on GPU. Only used in tests.
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2018, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgramImpl, which stores permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov <a.yupinov@gmail.com>
+ * \ingroup module_ewald
+ */
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/gmxopencl.h"
+#include "gromacs/gpu_utils/ocl_compiler.h"
+#include "gromacs/utility/stringutil.h"
+
+#include "pme-gpu-internal.h" // for GridOrdering enum
+#include "pme-gpu-program-impl.h"
+#include "pme-gpu-types-host.h"
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo)
+{
+ // Context creation (which should happen outside of this class: #2522
+ cl_platform_id platformId = deviceInfo->ocl_gpu_id.ocl_platform_id;
+ cl_device_id deviceId = deviceInfo->ocl_gpu_id.ocl_device_id;
+ cl_context_properties contextProperties[3];
+ contextProperties[0] = CL_CONTEXT_PLATFORM;
+ contextProperties[1] = (cl_context_properties) platformId;
+ contextProperties[2] = 0; /* Terminates the list of properties */
+
+ cl_int clError;
+ context = clCreateContext(contextProperties, 1, &deviceId, nullptr, nullptr, &clError);
+ if (clError != CL_SUCCESS)
+ {
+ const std::string errorString = gmx::formatString("Failed to create context for PME on GPU #%s:\n OpenCL error %d: %s",
+ deviceInfo->device_name, clError, ocl_get_error_string(clError).c_str());
+ GMX_THROW(gmx::InternalError(errorString));
+ }
+
+ warpSize = gmx::ocl::getWarpSize(context, deviceId);
+
+ //TODO: OpenCL kernel compilation should be here.
+}
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl()
+{
+ // TODO: log releasing errors
+ clReleaseContext(context);
+}
#include "pme-gpu-program-impl.h"
-PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t * /*unused*/) {};
+PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t * /*unused*/) : warpSize(0){}
PmeGpuProgramImpl::~PmeGpuProgramImpl() = default;
#include "pme-gpu-program-impl.h"
-#include "pme-gpu-internal.h" // for GridOrdering enum
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh" // only for warp_size
+
+#include "pme-gpu-internal.h" // for GridOrdering enum
#include "pme-gpu-types-host.h"
//! PME CUDA kernels forward declarations. Kernels are documented in their respective files.
PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *)
{
+ warpSize = warp_size;
+
// PME interpolation order
constexpr int pmeOrder = 4;
GMX_UNUSED_VALUE(pmeOrder);
using PmeKernelHandle = void *;
#endif
+ /*! \brief
+ * Maximum synchronous GPU thread group execution width.
+ * "Warp" is a CUDA term which we end up reusing in OpenCL kernels as well.
+ * For CUDA, this is a static value that comes from gromacs/gpu_utils/cuda_arch_utils.cuh;
+ * for OpenCL, we have to query it dynamically.
+ */
+ size_t warpSize;
+
//@{
/**
* Spread/spline kernels are compiled only for order of 4.
PmeGpuProgramStorage buildPmeGpuProgram(const gmx_device_info_t *deviceInfo)
{
+ if (!deviceInfo)
+ {
+ // This workaround is only needed for CodePath::CPU dummy in testhardwarecontexts.cpp
+ return nullptr;
+ }
return gmx::compat::make_unique<PmeGpuProgram>(deviceInfo);
}
* Removing warp dependency would also be nice (and would probably coincide with removing PME_SPREADGATHER_ATOMS_PER_WARP).
*
* \tparam order PME order
+ * \tparam atomsPerWarp Number of atoms processed by a warp
* \param[in] warpIndex Warp index wrt the block.
- * \param[in] atomWarpIndex Atom index wrt the warp (from 0 to PME_SPREADGATHER_ATOMS_PER_WARP - 1).
+ * \param[in] atomWarpIndex Atom index wrt the warp (from 0 to atomsPerWarp - 1).
*
* \returns Index into theta or dtheta array using GPU layout.
*/
-template <int order>
+template <int order, int atomsPerWarp>
int INLINE_EVERYWHERE getSplineParamIndexBase(int warpIndex, int atomWarpIndex)
{
- assert((atomWarpIndex >= 0) && (atomWarpIndex < PME_SPREADGATHER_ATOMS_PER_WARP));
+ assert((atomWarpIndex >= 0) && (atomWarpIndex < atomsPerWarp));
const int dimIndex = 0;
const int splineIndex = 0;
// The zeroes are here to preserve the full index formula for reference
- return (((splineIndex + order * warpIndex) * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex);
+ return (((splineIndex + order * warpIndex) * DIM + dimIndex) * atomsPerWarp + atomWarpIndex);
}
/*! \internal \brief
* This function consumes result of getSplineParamIndexBase() and adjusts it for \p dimIndex and \p splineIndex.
*
* \tparam order PME order
+ * \tparam atomsPerWarp Number of atoms processed by a warp
* \param[in] paramIndexBase Must be result of getSplineParamIndexBase().
* \param[in] dimIndex Dimension index (from 0 to 2)
* \param[in] splineIndex Spline contribution index (from 0 to \p order - 1)
*
* \returns Index into theta or dtheta array using GPU layout.
*/
-template <int order>
+template <int order, int atomsPerWarp>
int INLINE_EVERYWHERE getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
{
assert((dimIndex >= XX) && (dimIndex < DIM));
assert((splineIndex >= 0) && (splineIndex < order));
- return (paramIndexBase + (splineIndex * DIM + dimIndex) * PME_SPREADGATHER_ATOMS_PER_WARP);
+ return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerWarp);
}
#endif
float * __restrict__ gm_dtheta = kernelParams.atoms.d_dtheta;
int * __restrict__ gm_gridlineIndices = kernelParams.atoms.d_gridlineIndices;
+ const int atomsPerWarp = PME_SPREADGATHER_ATOMS_PER_WARP;
+
/* Fractional coordinates */
__shared__ float sm_fractCoords[atomsPerBlock * DIM];
/* Thread index w.r.t. warp */
const int threadWarpIndex = threadLocalId % warp_size;
/* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
- const int atomWarpIndex = threadWarpIndex % PME_SPREADGATHER_ATOMS_PER_WARP;
+ const int atomWarpIndex = threadWarpIndex % atomsPerWarp;
/* Atom index w.r.t. block/shared memory */
- const int atomIndexLocal = warpIndex * PME_SPREADGATHER_ATOMS_PER_WARP + atomWarpIndex;
+ const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
/* Atom index w.r.t. global memory */
const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
/* Spline contribution index in one dimension */
- const int orderIndex = threadWarpIndex / (PME_SPREADGATHER_ATOMS_PER_WARP * DIM);
+ const int orderIndex = threadWarpIndex / (atomsPerWarp * DIM);
/* Dimension index */
- const int dimIndex = (threadWarpIndex - orderIndex * (PME_SPREADGATHER_ATOMS_PER_WARP * DIM)) / PME_SPREADGATHER_ATOMS_PER_WARP;
+ const int dimIndex = (threadWarpIndex - orderIndex * (atomsPerWarp * DIM)) / atomsPerWarp;
/* Multi-purpose index of rvec/ivec atom data */
const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
*SPLINE_DATA_PTR(0) = div * (1.0f - dr) * SPLINE_DATA(0);
}
- const int thetaIndexBase = getSplineParamIndexBase<order>(warpIndex, atomWarpIndex);
+ const int thetaIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
/* Differentiation and storing the spline derivatives (dtheta) */
for (o = 0; o < order; o++)
#endif
{
- const int thetaIndex = getSplineParamIndex<order>(thetaIndexBase, dimIndex, o);
+ const int thetaIndex = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
const float dtheta = ((o > 0) ? SPLINE_DATA(o - 1) : 0.0f) - SPLINE_DATA(o);
for (o = 0; o < order; o++)
#endif
{
- const int thetaIndex = getSplineParamIndex<order>(thetaIndexBase, dimIndex, o);
+ const int thetaIndex = getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
sm_theta[thetaIndex] = SPLINE_DATA(o);
float * __restrict__ gm_grid = kernelParams.grid.d_realGrid;
+ const int atomsPerWarp = PME_SPREADGATHER_ATOMS_PER_WARP;
+
const int nx = kernelParams.grid.realGridSize[XX];
const int ny = kernelParams.grid.realGridSize[YY];
const int nz = kernelParams.grid.realGridSize[ZZ];
}
/* Atom index w.r.t. warp - alternating 0 1 0 1 .. */
- const int atomWarpIndex = atomIndexLocal % PME_SPREADGATHER_ATOMS_PER_WARP;
+ const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
/* Warp index w.r.t. block - could probably be obtained easier? */
- const int warpIndex = atomIndexLocal / PME_SPREADGATHER_ATOMS_PER_WARP;
+ const int warpIndex = atomIndexLocal / atomsPerWarp;
- const int splineIndexBase = getSplineParamIndexBase<order>(warpIndex, atomWarpIndex);
- const int splineIndexZ = getSplineParamIndex<order>(splineIndexBase, ZZ, ithz);
+ const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+ const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
const float thetaZ = sm_theta[splineIndexZ];
- const int splineIndexY = getSplineParamIndex<order>(splineIndexBase, YY, ithy);
+ const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
const float thetaY = sm_theta[splineIndexY];
const float constVal = thetaZ * thetaY * sm_coefficients[atomIndexLocal];
assert(isfinite(constVal));
ix -= nx;
}
const int gridIndexGlobal = ix * pny * pnz + constOffset;
- const int splineIndexX = getSplineParamIndex<order>(splineIndexBase, XX, ithx);
+ const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
const float thetaX = sm_theta[splineIndexX];
assert(isfinite(thetaX));
assert(isfinite(gm_grid[gridIndexGlobal]));
return Path::normalize(sourceRootPath);
}
-/*! \brief Get the warp size reported by device
- *
- * This is platform implementation dependant and seems to only work on the Nvidia and AMD platforms!
- * Nvidia reports 32, AMD for GPU 64. Ignore the rest
- *
- * \param context Current OpenCL context
- * \param deviceId OpenCL device with the context
- * \return cl_int value of the warp size
- *
- * \throws InternalError if an OpenCL error was encountered
- */
-static size_t
-getWarpSize(cl_context context, cl_device_id deviceId)
+size_t getWarpSize(cl_context context, cl_device_id deviceId)
{
cl_int cl_error;
const char *warpSizeKernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}";
namespace ocl
{
+/*! \brief Get the warp size reported by device
+ *
+ * This is platform implementation dependent and seems to only work on the Nvidia and AMD platforms!
+ * Nvidia reports 32, AMD for GPU 64. Intel seems to report 16, but that is not correct,
+ * as it execution width can be between 8-32 and it's picked per-kernel at compile-time.
+ * Therefore, for Intel it should actually be queried separately for each kernel (Redmine #2520).
+ *
+ * \param context Current OpenCL context
+ * \param deviceId OpenCL device with the context
+ * \return cl_int value of the warp size
+ *
+ * \throws InternalError if an OpenCL error was encountered
+ */
+size_t getWarpSize(cl_context context, cl_device_id deviceId);
+
/*! \brief Compile the specified kernel for the context and device.
*
* \param[out] fplog Open file pointer for log output