/*! \brief Log of the i and j cluster size.
* change this together with c_clSize !*/
-static const int c_clSizeLog2 = 3;
+static const int c_clSizeLog2 = 3;
/*! \brief Square of cluster size. */
-static const int c_clSizeSq = c_clSize*c_clSize;
+static const int c_clSizeSq = c_clSize*c_clSize;
/*! \brief j-cluster size after split (4 in the current implementation). */
-static const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+static const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
/*! \brief Stride in the force accumualation buffer */
-static const int c_fbufStride = c_clSizeSq;
+static const int c_fbufStride = c_clSizeSq;
/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
-static const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+static const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
-static const float c_oneSixth = 0.16666667f;
-static const float c_oneTwelveth = 0.08333333f;
+static const float c_oneSixth = 0.16666667f;
+static const float c_oneTwelveth = 0.08333333f;
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
- int tidxi, int aidx)
+ int tidxi, int aidx,
+ const unsigned int activemask)
{
- f.x += __shfl_down(f.x, 1);
- f.y += __shfl_up (f.y, 1);
- f.z += __shfl_down(f.z, 1);
+ f.x += gmx_shfl_down_sync(activemask, f.x, 1);
+ f.y += gmx_shfl_up_sync (activemask, f.y, 1);
+ f.z += gmx_shfl_down_sync(activemask, f.z, 1);
if (tidxi & 1)
{
f.x = f.y;
}
- f.x += __shfl_down(f.x, 2);
- f.z += __shfl_up (f.z, 2);
+ f.x += gmx_shfl_down_sync(activemask, f.x, 2);
+ f.z += gmx_shfl_up_sync (activemask, f.z, 2);
if (tidxi & 2)
{
f.x = f.z;
}
- f.x += __shfl_down(f.x, 4);
+ f.x += gmx_shfl_down_sync(activemask, f.x, 4);
if (tidxi < 3)
{
static __forceinline__ __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
float *fshift_buf, bool bCalcFshift,
- int tidxj, int aidx)
+ int tidxj, int aidx,
+ const unsigned int activemask)
{
- fin.x += __shfl_down(fin.x, c_clSize);
- fin.y += __shfl_up (fin.y, c_clSize);
- fin.z += __shfl_down(fin.z, c_clSize);
+ fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
+ fin.y += gmx_shfl_up_sync (activemask, fin.y, c_clSize);
+ fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
if (tidxj & 1)
{
fin.x = fin.y;
}
- fin.x += __shfl_down(fin.x, 2*c_clSize);
- fin.z += __shfl_up (fin.z, 2*c_clSize);
+ fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
+ fin.z += gmx_shfl_up_sync (activemask, fin.z, 2*c_clSize);
if (tidxj & 2)
{
static __forceinline__ __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
float *e_lj, float *e_el,
- int tidx)
+ int tidx,
+ const unsigned int activemask)
{
int i, sh;
#pragma unroll 5
for (i = 0; i < 5; i++)
{
- E_lj += __shfl_down(E_lj, sh);
- E_el += __shfl_down(E_el, sh);
+ E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
+ E_el += gmx_shfl_down_sync(activemask, E_el, sh);
sh += sh;
}