*
* Note that the current kernel implementation only supports NTHREAD_Z > 1 with
* shuffle-based reduction, hence CC >= 3.0.
+ *
+ * NOTEs/TODOs on the Volta / CUDA 9 support extensions:
+ * - the current way of computing the active masks with ballot_sync() should be
+ *   reconsidered: first, all masks could be computed with bitwise ops instead
+ *   of ballot; second, all conditionals involved are warp-uniform, so the sync
+ *   is not needed;
+ * - reconsider the use of __syncwarp(): its only role is currently to prevent
+ *   the WAR hazard due to the cj preload; we should try to replace it with
+ *   direct loads (which may be faster given the improved L1 cache on Volta).
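+ *
+ * For reference, a minimal sketch of the gmx_ballot_sync()/gmx_any_sync()/
+ * gmx_syncwarp() compatibility wrappers used below. This is an illustration
+ * only: the actual definitions are assumed to live in a shared CUDA utility
+ * header, and the exact signatures and version guard (CUDA_VERSION here) may
+ * differ. Pre-CUDA 9 the wrappers fall back to the legacy, implicitly
+ * warp-synchronous intrinsics:
+ *
+ *   static __forceinline__ __device__
+ *   unsigned int gmx_ballot_sync(const unsigned int activeMask, const int pred)
+ *   {
+ *   #if CUDA_VERSION < 9000
+ *       return __ballot(pred);                  // full warp implied pre-Volta
+ *   #else
+ *       return __ballot_sync(activeMask, pred);
+ *   #endif
+ *   }
+ *
+ *   static __forceinline__ __device__
+ *   int gmx_any_sync(const unsigned int activeMask, const int pred)
+ *   {
+ *   #if CUDA_VERSION < 9000
+ *       return __any(pred);
+ *   #else
+ *       return __any_sync(activeMask, pred);
+ *   #endif
+ *   }
+ *
+ *   static __forceinline__ __device__
+ *   void gmx_syncwarp(const unsigned int activeMask = c_fullWarpMask)
+ *   {
+ *   #if CUDA_VERSION >= 9000
+ *       __syncwarp(activeMask);                 // no-op pre-Volta (lockstep)
+ *   #endif
+ *   }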
*/
/* Kernel launch bounds for different compute capabilities. The value of NTHREAD_Z
#endif /* CALC_ENERGIES */
#ifdef EXCLUSION_FORCES
const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
#endif
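+ // mask of lanes that will execute the first j4-loop iteration; seeds the
+ // ballot/shuffle collectives used inside the loop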
+ int j4LoopStart = cij4_start + tidxz;
+ unsigned int j4LoopThreadMask = gmx_ballot_sync(c_fullWarpMask, j4LoopStart < cij4_end);
/* loop over the j clusters, i.e. those seen by any atom in the current super-cluster */
- for (j4 = cij4_start + tidxz; j4 < cij4_end; j4 += NTHREAD_Z)
+ for (j4 = j4LoopStart; j4 < cij4_end; j4 += NTHREAD_Z)
{
wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
imask = pl_cj4[j4].imei[widx].imask;
wexcl = excl[wexcl_idx].pair[(tidx) & (warp_size - 1)];
+ unsigned int imaskSkipConditionThreadMask = j4LoopThreadMask;
#ifndef PRUNE_NBL
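+ // when not pruning, lanes with an empty imask skip this j4 entirely;
+ // narrow the active mask accordingly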
+ imaskSkipConditionThreadMask = gmx_ballot_sync(j4LoopThreadMask, imask);
if (imask)
#endif
{
{
cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
}
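+ // wait for the cj preload (done by a subset of lanes) before the warp reads cjs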
+ gmx_syncwarp(imaskSkipConditionThreadMask);
/* Unrolling this loop
   - with pruning leads to register spilling;
   Tested with up to nvcc 7.5 */
for (jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
{
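+ // per-lane predicate for this j cluster and the mask of lanes that take the
+ // branch; the mask is reused by the j-force shuffle reduction below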
- if (imask & (superClInteractionMask << (jm * c_numClPerSupercl)))
+ const unsigned int jmSkipCondition = imask & (superClInteractionMask << (jm * c_numClPerSupercl));
+ const unsigned int jmSkipConditionThreadMask = gmx_ballot_sync(imaskSkipConditionThreadMask, jmSkipCondition);
+ if (jmSkipCondition)
{
mask_ji = (1U << (jm * c_numClPerSupercl));
#endif
for (i = 0; i < c_numClPerSupercl; i++)
{
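+ // narrow the active mask once more to the lanes that process this i-j cluster pair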
- if (imask & mask_ji)
+ const unsigned int iInnerSkipCondition = imask & mask_ji;
+ const unsigned int iInnerSkipConditionThreadMask = gmx_ballot_sync(jmSkipConditionThreadMask, iInnerSkipCondition);
+ if (iInnerSkipCondition)
{
ci = sci * c_numClPerSupercl + i; /* i cluster index */
/* If _none_ of the atoms pairs are in cutoff range,
the bit corresponding to the current
cluster-pair in imask gets set to 0. */
- if (!__any(r2 < rlist_sq))
+ if (!gmx_any_sync(iInnerSkipConditionThreadMask, r2 < rlist_sq))
{
imask &= ~mask_ji;
}
}
/* reduce j forces */
- reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj);
+ reduce_force_j_warp_shfl(fcj_buf, f, tidxi, aj, jmSkipConditionThreadMask);
}
}
#ifdef PRUNE_NBL
pl_cj4[j4].imei[widx].imask = imask;
#endif
}
+ // avoid shared memory WAR hazards between loop iterations
+ gmx_syncwarp(j4LoopThreadMask);
+ // update thread mask for next loop iteration
+ j4LoopThreadMask = gmx_ballot_sync(j4LoopThreadMask, (j4 + NTHREAD_Z) < cij4_end);
}
/* skip central shifts when summing shift forces */
ai = (sci * c_numClPerSupercl + i) * c_clSize + tidxi;
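+ // execution is convergent again after the j4 loop, hence the full warp mask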
reduce_force_i_warp_shfl(fci_buf[i], f,
&fshift_buf, bCalcFshift,
- tidxj, ai);
+ tidxj, ai, c_fullWarpMask);
}
/* add up local shift forces into global mem, tidxj indexes x,y,z */
#ifdef CALC_ENERGIES
/* reduce the energies over warps and store into global memory */
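+ // all lanes reach the final energy reduction, hence the full warp mask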
- reduce_energy_warp_shfl(E_lj, E_el, e_lj, e_el, tidx);
+ reduce_energy_warp_shfl(E_lj, E_el, e_lj, e_el, tidx, c_fullWarpMask);
#endif
}
#endif /* FUNCTION_DECLARATION_ONLY */