}
#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
- /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was gmx_syncwarp() in CUDA.
+ /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was __syncwarp() in CUDA.
* #2519
*/
barrier(CLK_LOCAL_MEM_FENCE);
static_assert(atomDataSize <= warp_size, "TODO: rework for atomDataSize > warp_size (order 8 or larger)");
const int width = atomDataSize;
- fx += gmx_shfl_down_sync(activeMask, fx, 1, width);
- fy += gmx_shfl_up_sync (activeMask, fy, 1, width);
- fz += gmx_shfl_down_sync(activeMask, fz, 1, width);
+ fx += __shfl_down_sync(activeMask, fx, 1, width);
+ fy += __shfl_up_sync (activeMask, fy, 1, width);
+ fz += __shfl_down_sync(activeMask, fz, 1, width);
if (splineIndex & 1)
{
fx = fy;
}
- fx += gmx_shfl_down_sync(activeMask, fx, 2, width);
- fz += gmx_shfl_up_sync (activeMask, fz, 2, width);
+ fx += __shfl_down_sync(activeMask, fx, 2, width);
+ fz += __shfl_up_sync (activeMask, fz, 2, width);
if (splineIndex & 2)
{
// We just have to further reduce those groups of 4
for (int delta = 4; delta < atomDataSize; delta <<= 1)
{
- fx += gmx_shfl_down_sync(activeMask, fx, delta, width);
+ fx += __shfl_down_sync(activeMask, fx, delta, width);
}
const int dimIndex = splineIndex;
}
}
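/* A minimal sketch (not part of the patch) of the interleaved shuffle
 * reduction used above, spelled out for a single group of width 8 on
 * CUDA 9+; the name reduceForce3 and the fixed width are illustrative.
 */
static __forceinline__ __device__
float3 reduceForce3(float3 f, const int lane, const unsigned int activeMask)
{
    const int width = 8;
    /* Pair sums: x and z fold downwards while y folds upwards */
    f.x += __shfl_down_sync(activeMask, f.x, 1, width);
    f.y += __shfl_up_sync  (activeMask, f.y, 1, width);
    f.z += __shfl_down_sync(activeMask, f.z, 1, width);
    if (lane & 1)
    {
        f.x = f.y; /* odd lanes carry the y pair sums in the x slot */
    }
    /* Quad sums: the same fold, now two lanes apart */
    f.x += __shfl_down_sync(activeMask, f.x, 2, width);
    f.z += __shfl_up_sync  (activeMask, f.z, 2, width);
    if (lane & 2)
    {
        f.x = f.z; /* lanes 2-3 of each quad carry the z sums */
    }
    /* Final fold: lanes 0, 1 and 2 end up holding the full x, y and z sums */
    f.x += __shfl_down_sync(activeMask, f.x, 4, width);
    return f;
}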
- gmx_syncwarp();
+ __syncwarp();
const float n = read_grid_size(realGridSizeFP, dimIndex);
const int atomIndex = sourceIndex / minStride;
sm_forces[forceIndexLocal] = result;
}
- gmx_syncwarp();
+ __syncwarp();
assert(atomsPerBlock <= warp_size);
/* Writing or adding the final forces component-wise, single warp */
const unsigned int activeMask = c_fullWarpMask;
/* Making pair sums */
- virxx += gmx_shfl_down_sync(activeMask, virxx, 1, width);
- viryy += gmx_shfl_up_sync (activeMask, viryy, 1, width);
- virzz += gmx_shfl_down_sync(activeMask, virzz, 1, width);
- virxy += gmx_shfl_up_sync (activeMask, virxy, 1, width);
- virxz += gmx_shfl_down_sync(activeMask, virxz, 1, width);
- viryz += gmx_shfl_up_sync (activeMask, viryz, 1, width);
- energy += gmx_shfl_down_sync(activeMask, energy, 1, width);
+ virxx += __shfl_down_sync(activeMask, virxx, 1, width);
+ viryy += __shfl_up_sync (activeMask, viryy, 1, width);
+ virzz += __shfl_down_sync(activeMask, virzz, 1, width);
+ virxy += __shfl_up_sync (activeMask, virxy, 1, width);
+ virxz += __shfl_down_sync(activeMask, virxz, 1, width);
+ viryz += __shfl_up_sync (activeMask, viryz, 1, width);
+ energy += __shfl_down_sync(activeMask, energy, 1, width);
if (threadLocalId & 1)
{
virxx = viryy; // virxx now holds virxx and viryy pair sums
}
/* Making quad sums */
- virxx += gmx_shfl_down_sync(activeMask, virxx, 2, width);
- virzz += gmx_shfl_up_sync (activeMask, virzz, 2, width);
- virxz += gmx_shfl_down_sync(activeMask, virxz, 2, width);
- energy += gmx_shfl_up_sync (activeMask, energy, 2, width);
+ virxx += __shfl_down_sync(activeMask, virxx, 2, width);
+ virzz += __shfl_up_sync (activeMask, virzz, 2, width);
+ virxz += __shfl_down_sync(activeMask, virxz, 2, width);
+ energy += __shfl_up_sync (activeMask, energy, 2, width);
if (threadLocalId & 2)
{
virxx = virzz; // virxx now holds quad sums of virxx, viryy, virzz and virxy
}
/* Making octet sums */
- virxx += gmx_shfl_down_sync(activeMask, virxx, 4, width);
- virxz += gmx_shfl_up_sync (activeMask, virxz, 4, width);
+ virxx += __shfl_down_sync(activeMask, virxx, 4, width);
+ virxz += __shfl_up_sync (activeMask, virxz, 4, width);
if (threadLocalId & 4)
{
virxx = virxz; // virxx now holds all 7 components' octet sums + unused paddings
#pragma unroll
for (int delta = 8; delta < width; delta <<= 1)
{
- virxx += gmx_shfl_down_sync(activeMask, virxx, delta, width);
+ virxx += __shfl_down_sync(activeMask, virxx, delta, width);
}
/* Now first 7 threads of each warp have the full output contributions in virxx */
#pragma unroll
for (int delta = stride; delta < warp_size; delta <<= 1)
{
- output += gmx_shfl_down_sync(activeMask, output, delta, warp_size);
+ output += __shfl_down_sync(activeMask, output, delta, warp_size);
}
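/* A minimal sketch of the plain strided fold used in the two unrolled
 * loops above, assuming a full warp on CUDA 9+; warpSum is an
 * illustrative name, not part of this patch.
 */
static __forceinline__ __device__
float warpSum(float x, const unsigned int activeMask)
{
#pragma unroll
    for (int delta = 1; delta < warp_size; delta <<= 1)
    {
        /* each step folds lanes that are delta apart; lane 0 ends up
           holding the total over the warp */
        x += __shfl_down_sync(activeMask, x, delta, warp_size);
    }
    return x;
}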
/* Final output */
if (validComponentIndex)
sm_fractCoords, gm_theta, gm_dtheta, gm_gridlineIndices,
gm_fractShiftsTable, gm_gridlineIndicesTable);
#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
- /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was gmx_syncwarp() in CUDA.
+ /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was __syncwarp() in CUDA.
* #2519
*/
barrier(CLK_LOCAL_MEM_FENCE);
__syncthreads();
calculate_splines<order, atomsPerBlock>(kernelParams, atomIndexOffset, (const float3 *)sm_coordinates,
sm_coefficients, sm_theta, sm_gridlineIndices);
- gmx_syncwarp();
+ __syncwarp();
}
else
{
#ifndef CUDA_ARCH_UTILS_CUH_
#define CUDA_ARCH_UTILS_CUH_
-#include "config.h"
-
#include "gromacs/utility/basedefinitions.h"
/*! \file
*/
static const unsigned int c_fullWarpMask = 0xffffffff;
-/* Below are backward-compatibility wrappers for CUDA 9 warp-wide intrinsics. */
-
-/*! \brief Compatibility wrapper around the CUDA __syncwarp() instrinsic. */
-static __forceinline__ __device__
-void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
-{
-#if GMX_CUDA_VERSION < 9000
- /* no sync needed on pre-Volta. */
- GMX_UNUSED_VALUE(activeMask);
-#else
- __syncwarp(activeMask);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __ballot()/__ballot_sync() instrinsic. */
-static __forceinline__ __device__
-unsigned int gmx_ballot_sync(const unsigned int activeMask,
- const int pred)
-{
-#if GMX_CUDA_VERSION < 9000
- GMX_UNUSED_VALUE(activeMask);
- return __ballot(pred);
-#else
- return __ballot_sync(activeMask, pred);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __any()/__any_sync() instrinsic. */
-static __forceinline__ __device__
-int gmx_any_sync(const unsigned int activeMask,
- const int pred)
-{
-#if GMX_CUDA_VERSION < 9000
- GMX_UNUSED_VALUE(activeMask);
- return __any(pred);
-#else
- return __any_sync(activeMask, pred);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __shfl_up()/__shfl_up_sync() instrinsic. */
-template <typename T>
-static __forceinline__ __device__
-T gmx_shfl_up_sync(const unsigned int activeMask,
- const T var,
- unsigned int offset,
- int width = warp_size)
-{
-#if GMX_CUDA_VERSION < 9000
- GMX_UNUSED_VALUE(activeMask);
- return __shfl_up(var, offset, width);
-#else
- return __shfl_up_sync(activeMask, var, offset, width);
-#endif
-}
-
-/*! \brief Compatibility wrapper around the CUDA __shfl_down()/__shfl_down_sync() instrinsic. */
-template <typename T>
-static __forceinline__ __device__
-T gmx_shfl_down_sync(const unsigned int activeMask,
- const T var,
- unsigned int offset,
- int width = warp_size)
-{
-#if GMX_CUDA_VERSION < 9000
- GMX_UNUSED_VALUE(activeMask);
- return __shfl_down(var, offset, width);
-#else
- return __shfl_down_sync(activeMask, var, offset, width);
-#endif
-}
-
/*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
*
* Only texture objects supported.
{
cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
}
- gmx_syncwarp(c_fullWarpMask);
+ __syncwarp(c_fullWarpMask);
/* Unrolling this loop
   - with pruning leads to register spilling */
/* If _none_ of the atom pairs are in cutoff range,
the bit corresponding to the current
cluster-pair in imask gets set to 0. */
- if (!gmx_any_sync(c_fullWarpMask, r2 < rlist_sq))
+ if (!__any_sync(c_fullWarpMask, r2 < rlist_sq))
{
imask &= ~mask_ji;
}
#endif
}
// avoid shared memory WAR hazards between loop iterations
- gmx_syncwarp(c_fullWarpMask);
+ __syncwarp(c_fullWarpMask);
}
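/* A minimal sketch of the shared-memory staging pattern the two
 * __syncwarp() calls above protect, assuming one 32-thread warp per
 * block on CUDA 9+; the kernel and the gm_cj layout are illustrative,
 * not taken from this patch.
 */
__global__ void stagingSketch(const int *gm_cj, const int nTiles, int *gm_out)
{
    __shared__ int cjs[32];
    const int      lane = threadIdx.x & 31;

    int acc = 0;
    for (int t = 0; t < nTiles; t++)
    {
        cjs[lane] = gm_cj[t*32 + lane];  /* warp-cooperative load */
        __syncwarp(c_fullWarpMask);      /* make all lanes' stores visible */
        for (int j = 0; j < 32; j++)
        {
            acc += cjs[j];               /* every lane reads every entry */
        }
        __syncwarp(c_fullWarpMask);      /* WAR guard: no lane may overwrite
                                            cjs[] while another still reads it */
    }
    gm_out[blockIdx.x*32 + lane] = acc;
}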
/* skip central shifts when summing shift forces */
{
cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
}
- gmx_syncwarp(c_fullWarpMask);
+ __syncwarp(c_fullWarpMask);
#pragma unroll 4
for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
/* If _none_ of the atom pairs are in rlistOuter
range, the bit corresponding to the current
cluster-pair in imask gets set to 0. */
- if (haveFreshList && !gmx_any_sync(c_fullWarpMask, r2 < rlistOuter_sq))
+ if (haveFreshList && !__any_sync(c_fullWarpMask, r2 < rlistOuter_sq))
{
imaskFull &= ~mask_ji;
}
/* If any atom pair is within range, set the bit
corresponding to the current cluster-pair. */
- if (gmx_any_sync(c_fullWarpMask, r2 < rlistInner_sq))
+ if (__any_sync(c_fullWarpMask, r2 < rlistInner_sq))
{
imaskNew |= mask_ji;
}
plist.cj4[j4].imei[widx].imask = imaskNew;
}
// avoid shared memory WAR hazards between loop iterations
- gmx_syncwarp(c_fullWarpMask);
+ __syncwarp(c_fullWarpMask);
}
}
#endif /* FUNCTION_DECLARATION_ONLY */
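/* A minimal sketch of the per-warp pruning decision used above, assuming
 * a full warp on CUDA 9+; pruneClusterPair is an illustrative name, not
 * part of this patch.
 */
static __forceinline__ __device__
unsigned int pruneClusterPair(unsigned int imask, const unsigned int mask_ji,
                              const float r2, const float rlistSq)
{
    /* drop the cluster pair only if no lane in the warp has a pair in range */
    if (!__any_sync(c_fullWarpMask, r2 < rlistSq))
    {
        imask &= ~mask_ji;
    }
    return imask;
}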
int tidxi, int aidx,
const unsigned int activemask)
{
- f.x += gmx_shfl_down_sync(activemask, f.x, 1);
- f.y += gmx_shfl_up_sync (activemask, f.y, 1);
- f.z += gmx_shfl_down_sync(activemask, f.z, 1);
+ f.x += __shfl_down_sync(activemask, f.x, 1);
+ f.y += __shfl_up_sync (activemask, f.y, 1);
+ f.z += __shfl_down_sync(activemask, f.z, 1);
if (tidxi & 1)
{
f.x = f.y;
}
- f.x += gmx_shfl_down_sync(activemask, f.x, 2);
- f.z += gmx_shfl_up_sync (activemask, f.z, 2);
+ f.x += __shfl_down_sync(activemask, f.x, 2);
+ f.z += __shfl_up_sync (activemask, f.z, 2);
if (tidxi & 2)
{
f.x = f.z;
}
- f.x += gmx_shfl_down_sync(activemask, f.x, 4);
+ f.x += __shfl_down_sync(activemask, f.x, 4);
if (tidxi < 3)
{
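/* A minimal sketch of the component-wise write that the tidxi < 3 branch
 * above leads into; the fout layout and the use of atomicAdd are
 * assumptions for illustration, not taken from this patch.
 */
static __forceinline__ __device__
void scatterReducedForce(float *fout, const int aidx, const int tidxi,
                         const float fx)
{
    if (tidxi < 3)
    {
        /* after the shuffles, lanes 0-2 hold the reduced x, y and z in f.x */
        atomicAdd(&fout[3*aidx + tidxi], fx);
    }
}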
int tidxj, int aidx,
const unsigned int activemask)
{
- fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
- fin.y += gmx_shfl_up_sync (activemask, fin.y, c_clSize);
- fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
+ fin.x += __shfl_down_sync(activemask, fin.x, c_clSize);
+ fin.y += __shfl_up_sync (activemask, fin.y, c_clSize);
+ fin.z += __shfl_down_sync(activemask, fin.z, c_clSize);
if (tidxj & 1)
{
fin.x = fin.y;
}
- fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
- fin.z += gmx_shfl_up_sync (activemask, fin.z, 2*c_clSize);
+ fin.x += __shfl_down_sync(activemask, fin.x, 2*c_clSize);
+ fin.z += __shfl_up_sync (activemask, fin.z, 2*c_clSize);
if (tidxj & 2)
{
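/* A minimal sketch of the row-strided variant above: with a c_clSize-wide
 * thread tile, shuffling by multiples of c_clSize reduces over tidxj while
 * keeping tidxi fixed. Written as a plain downward fold (the kernel itself
 * interleaves up/down shuffles); assumes c_clSize == 8 and a 32-lane warp.
 */
static __forceinline__ __device__
float reduceOverRows(float v, const unsigned int activemask)
{
    for (int delta = c_clSize; delta < warp_size; delta <<= 1)
    {
        /* fold rows delta/c_clSize apart into the lower rows */
        v += __shfl_down_sync(activemask, v, delta, warp_size);
    }
    return v; /* the lanes of row 0 now hold the per-column totals */
}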
#pragma unroll 5
for (i = 0; i < 5; i++)
{
- E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
- E_el += gmx_shfl_down_sync(activemask, E_el, sh);
+ E_lj += __shfl_down_sync(activemask, E_lj, sh);
+ E_el += __shfl_down_sync(activemask, E_el, sh);
sh += sh;
}
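/* A minimal sketch of the doubling-shift reduction above: sh starts at 1
 * and doubles each iteration, so five steps cover a 32-lane warp and lane 0
 * ends up holding both totals. reduceEnergies is an illustrative name, not
 * part of this patch (CUDA 9+).
 */
static __forceinline__ __device__
void reduceEnergies(float &E_lj, float &E_el, const unsigned int activemask)
{
    int sh = 1;
#pragma unroll 5
    for (int i = 0; i < 5; i++)
    {
        E_lj += __shfl_down_sync(activemask, E_lj, sh);
        E_el += __shfl_down_sync(activemask, E_el, sh);
        sh   += sh; /* 1, 2, 4, 8, 16 */
    }
}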