* Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
*/
#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_SIMD_X86_AVX_256_OR_HIGHER))
-#define CUTOFF_BLENDV
+#define NBNXN_CUTOFF_USE_BLENDV
#endif
/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
* This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
* Tested with icc 13.
*/
#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_SIMD_X86_AVX_256_OR_HIGHER
-#define CUTOFF_BLENDV
+#define NBNXN_CUTOFF_USE_BLENDV
#endif
#endif
gmx_simd_real_t tx_S2, ty_S2, tz_S2;
gmx_simd_real_t rsq_S0, rinv_S0, rinvsq_S0;
gmx_simd_real_t rsq_S2, rinv_S2, rinvsq_S2;
-#ifndef CUTOFF_BLENDV
+#ifndef NBNXN_CUTOFF_USE_BLENDV
/* wco: within cut-off, mask of all 1's or 0's */
gmx_simd_bool_t wco_S0;
gmx_simd_bool_t wco_S2;
gmx_simd_bool_t wco_vdw_S2;
#endif
#endif
+
+#if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+ gmx_simd_real_t r_S0;
+ gmx_simd_real_t r_S2;
+#endif
+
+#if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+ gmx_simd_real_t rsw_S0, rsw2_S0, rsw2_r_S0;
+#ifndef HALF_LJ
+ gmx_simd_real_t rsw_S2, rsw2_S2, rsw2_r_S2;
+#endif
+#endif
+
#ifdef CALC_COULOMB
#ifdef CHECK_EXCLS
/* 1/r masked with the interaction mask */
gmx_simd_real_t frcoul_S2;
#ifdef CALC_COUL_TAB
/* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
- gmx_simd_real_t r_S0, rs_S0, rf_S0, frac_S0;
- gmx_simd_real_t r_S2, rs_S2, rf_S2, frac_S2;
+ gmx_simd_real_t rs_S0, rf_S0, frac_S0;
+ gmx_simd_real_t rs_S2, rf_S2, frac_S2;
/* Table index: rs truncated to an int */
gmx_simd_int32_t ti_S0, ti_S2;
/* Linear force table values */
#endif
#endif
- gmx_simd_real_t FrLJ6_S0, FrLJ12_S0;
+ gmx_simd_real_t FrLJ6_S0, FrLJ12_S0, frLJ_S0;
#ifndef HALF_LJ
- gmx_simd_real_t FrLJ6_S2, FrLJ12_S2;
+ gmx_simd_real_t FrLJ6_S2, FrLJ12_S2, frLJ_S2;
#endif
-#ifdef CALC_ENERGIES
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
gmx_simd_real_t VLJ6_S0, VLJ12_S0, VLJ_S0;
#ifndef HALF_LJ
gmx_simd_real_t VLJ6_S2, VLJ12_S2, VLJ_S2;
rsq_S0 = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0);
rsq_S2 = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2);
-#ifndef CUTOFF_BLENDV
+#ifndef NBNXN_CUTOFF_USE_BLENDV
wco_S0 = gmx_simd_cmplt_r(rsq_S0, rc2_S);
wco_S2 = gmx_simd_cmplt_r(rsq_S2, rc2_S);
#endif
#endif /* CALC_LJ */
-#ifndef CUTOFF_BLENDV
+#ifndef NBNXN_CUTOFF_USE_BLENDV
rinv_S0 = gmx_simd_blendzero_r(rinv_S0, wco_S0);
rinv_S2 = gmx_simd_blendzero_r(rinv_S2, wco_S2);
#else
/* We need to mask (or limit) rsq for the cut-off,
* as large distances can cause an overflow in gmx_pmecorrF/V.
*/
-#ifndef CUTOFF_BLENDV
+#ifndef NBNXN_CUTOFF_USE_BLENDV
brsq_S0 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S0, wco_S0));
brsq_S2 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S2, wco_S2));
#else
rinvsix_S2 = gmx_simd_blendzero_r(rinvsix_S2, interact_S2);
#endif
#endif
-#ifdef VDW_CUTOFF_CHECK
- rinvsix_S0 = gmx_simd_blendzero_r(rinvsix_S0, wco_vdw_S0);
-#ifndef HALF_LJ
- rinvsix_S2 = gmx_simd_blendzero_r(rinvsix_S2, wco_vdw_S2);
-#endif
-#endif
+
+#ifndef LJ_FORCE_SWITCH
+ /* We have plain LJ with simple C6/6 C12/12 coefficients */
FrLJ6_S0 = gmx_simd_mul_r(c6_S0, rinvsix_S0);
#ifndef HALF_LJ
FrLJ6_S2 = gmx_simd_mul_r(c6_S2, rinvsix_S2);
#ifndef HALF_LJ
FrLJ12_S2 = gmx_simd_mul_r(c12_S2, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2));
#endif
+#endif
+
+#if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+ /* We switch the LJ force */
+ r_S0 = gmx_simd_mul_r(rsq_S0, rinv_S0);
+ rsw_S0 = gmx_simd_max_r(gmx_simd_sub_r(r_S0, rswitch_S), zero_S);
+ rsw2_S0 = gmx_simd_mul_r(rsw_S0, rsw_S0);
+ rsw2_r_S0 = gmx_simd_mul_r(rsw2_S0, r_S0);
+#ifndef HALF_LJ
+ r_S2 = gmx_simd_mul_r(rsq_S2, rinv_S2);
+ rsw_S2 = gmx_simd_max_r(gmx_simd_sub_r(r_S2, rswitch_S), zero_S);
+ rsw2_S2 = gmx_simd_mul_r(rsw_S2, rsw_S2);
+ rsw2_r_S2 = gmx_simd_mul_r(rsw2_S2, r_S2);
+#endif
+#endif
+
+#ifdef LJ_FORCE_SWITCH
+
+#define add_fr_switch(fr, rsw, rsw2_r, c2, c3) gmx_simd_fmadd_r(gmx_simd_fmadd_r(c3, rsw, c2), rsw2_r, fr)
+
+ FrLJ6_S0 = gmx_simd_mul_r(c6_S0, add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S));
+#ifndef HALF_LJ
+ FrLJ6_S2 = gmx_simd_mul_r(c6_S2, add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S));
+#endif
+ FrLJ12_S0 = gmx_simd_mul_r(c12_S0, add_fr_switch(gmx_simd_mul_r(rinvsix_S0, rinvsix_S0), rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S));
+#ifndef HALF_LJ
+ FrLJ12_S2 = gmx_simd_mul_r(c12_S2, add_fr_switch(gmx_simd_mul_r(rinvsix_S2, rinvsix_S2), rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S));
+#endif
+#undef add_fr_switch
+#endif /* LJ_FORCE_SWITCH */
+
#endif /* not LJ_COMB_LB */
#ifdef LJ_COMB_LB
#endif
#endif /* LJ_COMB_LB */
+ /* Determine the total scalar LJ force*r */
+ frLJ_S0 = gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0);
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2);
+#endif
+
+#if defined CALC_ENERGIES && !defined LJ_POT_SWITCH
+#ifndef LJ_FORCE_SWITCH
+ /* Calculate the LJ energies, with constant potential shift */
+ VLJ6_S0 = gmx_simd_mul_r(sixth_S, gmx_simd_fmadd_r(c6_S0, p6_cpot_S, FrLJ6_S0));
+#ifndef HALF_LJ
+ VLJ6_S2 = gmx_simd_mul_r(sixth_S, gmx_simd_fmadd_r(c6_S2, p6_cpot_S, FrLJ6_S2));
+#endif
+ VLJ12_S0 = gmx_simd_mul_r(twelveth_S, gmx_simd_fmadd_r(c12_S0, p12_cpot_S, FrLJ12_S0));
+#ifndef HALF_LJ
+ VLJ12_S2 = gmx_simd_mul_r(twelveth_S, gmx_simd_fmadd_r(c12_S2, p12_cpot_S, FrLJ12_S2));
+#endif
+#else
+
+#define v_fswitch_pr(rsw, rsw2, c0, c3, c4) gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), gmx_simd_mul_r(rsw2, rsw), c0)
+
+ VLJ6_S0 = gmx_simd_mul_r(c6_S0, gmx_simd_fmadd_r(sixth_S, rinvsix_S0, v_fswitch_pr(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
+#ifndef HALF_LJ
+ VLJ6_S2 = gmx_simd_mul_r(c6_S2, gmx_simd_fmadd_r(sixth_S, rinvsix_S2, v_fswitch_pr(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
+#endif
+ VLJ12_S0 = gmx_simd_mul_r(c12_S0, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0), v_fswitch_pr(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S)));
+#ifndef HALF_LJ
+ VLJ12_S2 = gmx_simd_mul_r(c12_S2, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2), v_fswitch_pr(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S)));
+#endif
+#undef v_fswitch_pr
+#endif /* LJ_FORCE_SWITCH */
+
+ /* Add up the repulsion and dispersion */
+ VLJ_S0 = gmx_simd_sub_r(VLJ12_S0, VLJ6_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_sub_r(VLJ12_S2, VLJ6_S2);
+#endif
+#endif /* CALC_ENERGIES && !LJ_POT_SWITCH */
+
+#ifdef LJ_POT_SWITCH
+ /* We always need the potential, since it is needed for the force */
+ VLJ_S0 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S0, gmx_simd_mul_r(twelveth_S, FrLJ12_S0));
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S2, gmx_simd_mul_r(twelveth_S, FrLJ12_S2));
+#endif
+
+ {
+ gmx_simd_real_t sw_S0, dsw_S0;
+#ifndef HALF_LJ
+ gmx_simd_real_t sw_S2, dsw_S2;
+#endif
+
+#define switch_pr(rsw, rsw2, c3, c4, c5) gmx_simd_fmadd_r(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c5, rsw, c4), rsw, c3), gmx_simd_mul_r(rsw2, rsw), one_S)
+#define dswitch_pr(rsw, rsw2, c2, c3, c4) gmx_simd_mul_r(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), rsw, c2), rsw2)
+
+ sw_S0 = switch_pr(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
+ dsw_S0 = dswitch_pr(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
+#ifndef HALF_LJ
+ sw_S2 = switch_pr(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
+ dsw_S2 = dswitch_pr(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
+#endif
+ frLJ_S0 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S0, VLJ_S0), r_S0, gmx_simd_mul_r(sw_S0, frLJ_S0));
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S2, VLJ_S2), r_S2, gmx_simd_mul_r(sw_S2, frLJ_S2));
+#endif
+#ifdef CALC_ENERGIES
+ VLJ_S0 = gmx_simd_mul_r(sw_S0, VLJ_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_mul_r(sw_S2, VLJ_S2);
+#endif
+#endif
+
+#undef switch_pr
+#undef dswitch_pr
+ }
+#endif /* LJ_POT_SWITCH */
+
+#if defined VDW_CUTOFF_CHECK
+ /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
+ * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
+ */
+ frLJ_S0 = gmx_simd_blendzero_r(frLJ_S0, wco_vdw_S0);
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_blendzero_r(frLJ_S2, wco_vdw_S2);
+#endif
+#endif
+
+#ifdef CALC_ENERGIES
+ /* The potential shift should be removed for pairs beyond cut-off */
+ VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, wco_vdw_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, wco_vdw_S2);
+#endif
+#endif
+
+#if defined CALC_ENERGIES && defined CHECK_EXCLS
+ /* The potential shift should be removed for excluded pairs */
+ VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, interact_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, interact_S2);
+#endif
+#endif
+
+
#endif /* CALC_LJ */
#ifdef CALC_ENERGIES
#endif
#ifdef CALC_LJ
- /* Calculate the LJ energies */
- VLJ6_S0 = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S0, gmx_simd_mul_r(c6_S0, sh_invrc6_S)));
-#ifndef HALF_LJ
- VLJ6_S2 = gmx_simd_mul_r(sixth_S, gmx_simd_sub_r(FrLJ6_S2, gmx_simd_mul_r(c6_S2, sh_invrc6_S)));
-#endif
- VLJ12_S0 = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S0, gmx_simd_mul_r(c12_S0, sh_invrc12_S)));
-#ifndef HALF_LJ
- VLJ12_S2 = gmx_simd_mul_r(twelveth_S, gmx_simd_sub_r(FrLJ12_S2, gmx_simd_mul_r(c12_S2, sh_invrc12_S)));
-#endif
-
- VLJ_S0 = gmx_simd_sub_r(VLJ12_S0, VLJ6_S0);
-#ifndef HALF_LJ
- VLJ_S2 = gmx_simd_sub_r(VLJ12_S2, VLJ6_S2);
-#endif
- /* The potential shift should be removed for pairs beyond cut-off */
- VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, wco_vdw_S0);
-#ifndef HALF_LJ
- VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, wco_vdw_S2);
-#endif
-#ifdef CHECK_EXCLS
- /* The potential shift should be removed for excluded pairs */
- VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, interact_S0);
-#ifndef HALF_LJ
- VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, interact_S2);
-#endif
-#endif
#ifndef ENERGY_GROUPS
Vvdwtot_S = gmx_simd_add_r(Vvdwtot_S,
#ifndef HALF_LJ
#ifdef CALC_LJ
#ifdef CALC_COULOMB
- fscal_S0 = gmx_simd_mul_r(rinvsq_S0,
- gmx_simd_add_r(frcoul_S0,
- gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0)));
+ fscal_S0 = gmx_simd_mul_r(rinvsq_S0, gmx_simd_add_r(frcoul_S0, frLJ_S0));
#else
- fscal_S0 = gmx_simd_mul_r(rinvsq_S0,
- (
- gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0)));
+ fscal_S0 = gmx_simd_mul_r(rinvsq_S0, frLJ_S0);
#endif
#else
fscal_S0 = gmx_simd_mul_r(rinvsq_S0, frcoul_S0);
#endif /* CALC_LJ */
#if defined CALC_LJ && !defined HALF_LJ
#ifdef CALC_COULOMB
- fscal_S2 = gmx_simd_mul_r(rinvsq_S2,
- gmx_simd_add_r(frcoul_S2,
- gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2)));
+ fscal_S2 = gmx_simd_mul_r(rinvsq_S2, gmx_simd_add_r(frcoul_S2, frLJ_S2));
#else
- fscal_S2 = gmx_simd_mul_r(rinvsq_S2,
- (
- gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2)));
+ fscal_S2 = gmx_simd_mul_r(rinvsq_S2, frLJ_S2);
#endif
#else
/* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
#undef wco_vdw_S0
#undef wco_vdw_S2
-#undef CUTOFF_BLENDV
+#undef NBNXN_CUTOFF_USE_BLENDV
#undef EXCL_FORCES