*/
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/math/utilities.h"
#include "gromacs/pbcutil/ishift.h"
/* Note that floating-point constants in CUDA code should be suffixed
* code that is in double precision.
*/
-#if GMX_PTX_ARCH < 300
-#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300"
+/* Reject only non-zero architecture values below 300; GMX_PTX_ARCH == 0 is
+ * deliberately exempted (presumably the host compilation pass - confirm
+ * against the definition of GMX_PTX_ARCH in cuda_arch_utils.cuh). */
+#if GMX_PTX_ARCH < 300 && GMX_PTX_ARCH != 0
+#error "nbnxn_cuda_kernel.cuh included with non-zero GMX_PTX_ARCH < 300"
#endif
#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
Kernel launch parameters:
- #blocks = #pair lists, blockId = pair list Id
- #threads = NTHREAD_Z * c_clSize^2
- - shmem = see nbnxn_cuda.cu:calc_shmem_required()
+ - shmem = see nbnxn_cuda.cu:calc_shmem_required_nonbonded()
Each thread calculates an i force-component taking one pair of i-j atoms.
*/
#ifdef EL_RF
float two_k_rf = nbparam.two_k_rf;
#endif
-#ifdef EL_EWALD_TAB
- float coulomb_tab_scale = nbparam.coulomb_tab_scale;
-#endif
#ifdef EL_EWALD_ANA
float beta2 = nbparam.ewald_beta*nbparam.ewald_beta;
float beta3 = nbparam.ewald_beta*nbparam.ewald_beta*nbparam.ewald_beta;
#endif
#ifdef PRUNE_NBL
- float rlist_sq = nbparam.rlist_sq;
+ float rlist_sq = nbparam.rlistOuter_sq;
#endif
#ifdef CALC_ENERGIES
ci = sci * c_numClPerSupercl + tidxj;
ai = ci * c_clSize + tidxi;
- xqbuf = xq[ai] + shift_vec[nb_sci.shift];
+ float *shiftptr = (float *)&shift_vec[nb_sci.shift];
+ xqbuf = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0f);
xqbuf.w *= nbparam.epsfac;
xqib[tidxj * c_clSize + tidxi] = xqbuf;
#endif
#ifdef LJ_EWALD
+ #if DISABLE_CUDA_TEXTURES
+ E_lj += LDG(&nbparam.nbfp[atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2]);
+ #else
E_lj += tex1Dfetch<float>(nbparam.nbfp_texobj, atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2);
+ #endif
#endif
}
#endif /* CALC_ENERGIES */
+#ifdef EXCLUSION_FORCES
+ const int nonSelfInteraction = !(nb_sci.shift == CENTRAL & tidxj <= tidxi); /* == and <= bind tighter than &, so this reads !((nb_sci.shift == CENTRAL) & (tidxj <= tidxi)); the bitwise & evaluates both comparisons without short-circuiting */
+#endif
+
int j4LoopStart = cij4_start + tidxz;
/* loop over the j clusters = seen by any of the atoms in the current super-cluster */
for (j4 = j4LoopStart; j4 < cij4_end; j4 += NTHREAD_Z)
#endif
{
/* Pre-load cj into shared memory on both warps separately */
- if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize)
+ if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
{
cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
}
/* cutoff & exclusion check */
#ifdef EXCLUSION_FORCES
- if (r2 < rcoulomb_sq *
- (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+ if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
#else
- if (r2 < rcoulomb_sq * int_bit)
+ if ((r2 < rcoulomb_sq) * int_bit)
#endif
{
/* load the rest of the i-atom parameters */
#ifndef LJ_COMB
/* LJ 6*C6 and 12*C12 */
typei = atib[i * c_clSize + tidxi];
- c6 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej));
- c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej) + 1);
+ fetch_nbfp_c6_c12(c6, c12, nbparam, ntypes * typei + typej);
#else
ljcp_i = ljcpib[i * c_clSize + tidxi];
#ifdef LJ_COMB_GEOM
#ifdef LJ_POT_SWITCH
#ifdef CALC_ENERGIES
- calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+ calculate_potential_switch_F_E(nbparam, inv_r, r2, &F_invr, &E_lj_p);
#else
- calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+ calculate_potential_switch_F(nbparam, inv_r, r2, &F_invr, &E_lj_p);
#endif /* CALC_ENERGIES */
#endif /* LJ_POT_SWITCH */
F_invr += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
#elif defined EL_EWALD_TAB
F_invr += qi * qj_f * (int_bit*inv_r2 -
- interpolate_coulomb_force_r(nbparam.coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)) * inv_r;
+ interpolate_coulomb_force_r(nbparam, r2 * inv_r)) * inv_r;
#endif /* EL_EWALD_ANA/TAB */
#ifdef CALC_ENERGIES