/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2009, The GROMACS Development Team
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
- * David van der Spoel, Berk Hess, Erik Lindahl, and including many
- * others, as listed in the AUTHORS file in the top-level source
- * directory and at http://www.gromacs.org.
+ * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
*
* GROMACS is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
*/
-/* When calculating RF or Ewald interactions we calculate the electrostatic
+/* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
* forces on excluded atom pairs here in the non-bonded loops.
* But when energies and/or virial is required we calculate them
* separately to as then it is easier to separate the energy and virial
* contributions.
*/
-#if defined CHECK_EXCLS && defined CALC_COULOMB
+#if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
#define EXCL_FORCES
#endif
/* Without exclusions and energies we only need to mask the cut-off,
* this can be faster with blendv.
*/
-#if !(defined CHECK_EXCLS || defined CALC_ENERGIES) && defined GMX_SIMD_HAVE_BLENDV && !defined COUNT_PAIRS
+#if !(defined CHECK_EXCLS || defined CALC_ENERGIES || defined LJ_EWALD_GEOM) && defined GMX_SIMD_HAVE_BLENDV
/* With RF and tabulated Coulomb we replace cmp+and with sub+blendv.
* With gcc this is slower, except for RF on Sandy Bridge.
* Tested with gcc 4.6.2, 4.6.3 and 4.7.1.
*/
-#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_X86_AVX_256))
-#define CUTOFF_BLENDV
+#if (defined CALC_COUL_RF || defined CALC_COUL_TAB) && (!defined __GNUC__ || (defined CALC_COUL_RF && defined GMX_SIMD_X86_AVX_256_OR_HIGHER))
+#define NBNXN_CUTOFF_USE_BLENDV
#endif
/* With analytical Ewald we replace cmp+and+and with sub+blendv+blendv.
* This is only faster with icc on Sandy Bridge (PS kernel slower than gcc 4.7).
* Tested with icc 13.
*/
-#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_X86_AVX_256
-#define CUTOFF_BLENDV
+#if defined CALC_COUL_EWALD && defined __INTEL_COMPILER && defined GMX_SIMD_X86_AVX_256_OR_HIGHER
+#define NBNXN_CUTOFF_USE_BLENDV
#endif
#endif
#ifdef CHECK_EXCLS
/* Interaction (non-exclusion) mask of all 1's or 0's */
- gmx_mm_pb interact_S0;
- gmx_mm_pb interact_S2;
-#endif
-
- gmx_mm_pr jx_S, jy_S, jz_S;
- gmx_mm_pr dx_S0, dy_S0, dz_S0;
- gmx_mm_pr dx_S2, dy_S2, dz_S2;
- gmx_mm_pr tx_S0, ty_S0, tz_S0;
- gmx_mm_pr tx_S2, ty_S2, tz_S2;
- gmx_mm_pr rsq_S0, rinv_S0, rinvsq_S0;
- gmx_mm_pr rsq_S2, rinv_S2, rinvsq_S2;
-#ifndef CUTOFF_BLENDV
+ gmx_simd_bool_t interact_S0;
+ gmx_simd_bool_t interact_S2;
+#endif
+
+ gmx_simd_real_t jx_S, jy_S, jz_S;
+ gmx_simd_real_t dx_S0, dy_S0, dz_S0;
+ gmx_simd_real_t dx_S2, dy_S2, dz_S2;
+ gmx_simd_real_t tx_S0, ty_S0, tz_S0;
+ gmx_simd_real_t tx_S2, ty_S2, tz_S2;
+ gmx_simd_real_t rsq_S0, rinv_S0, rinvsq_S0;
+ gmx_simd_real_t rsq_S2, rinv_S2, rinvsq_S2;
+#ifndef NBNXN_CUTOFF_USE_BLENDV
/* wco: within cut-off, mask of all 1's or 0's */
- gmx_mm_pb wco_S0;
- gmx_mm_pb wco_S2;
+ gmx_simd_bool_t wco_S0;
+ gmx_simd_bool_t wco_S2;
#endif
#ifdef VDW_CUTOFF_CHECK
- gmx_mm_pb wco_vdw_S0;
+ gmx_simd_bool_t wco_vdw_S0;
#ifndef HALF_LJ
- gmx_mm_pb wco_vdw_S2;
+ gmx_simd_bool_t wco_vdw_S2;
#endif
#endif
+
+#if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+ gmx_simd_real_t r_S0;
+#if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
+ gmx_simd_real_t r_S2;
+#endif
+#endif
+
+#if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+ gmx_simd_real_t rsw_S0, rsw2_S0;
+#ifndef HALF_LJ
+ gmx_simd_real_t rsw_S2, rsw2_S2;
+#endif
+#endif
+
#ifdef CALC_COULOMB
#ifdef CHECK_EXCLS
/* 1/r masked with the interaction mask */
- gmx_mm_pr rinv_ex_S0;
- gmx_mm_pr rinv_ex_S2;
+ gmx_simd_real_t rinv_ex_S0;
+ gmx_simd_real_t rinv_ex_S2;
#endif
- gmx_mm_pr jq_S;
- gmx_mm_pr qq_S0;
- gmx_mm_pr qq_S2;
+ gmx_simd_real_t jq_S;
+ gmx_simd_real_t qq_S0;
+ gmx_simd_real_t qq_S2;
#ifdef CALC_COUL_TAB
/* The force (PME mesh force) we need to subtract from 1/r^2 */
- gmx_mm_pr fsub_S0;
- gmx_mm_pr fsub_S2;
+ gmx_simd_real_t fsub_S0;
+ gmx_simd_real_t fsub_S2;
#endif
#ifdef CALC_COUL_EWALD
- gmx_mm_pr brsq_S0, brsq_S2;
- gmx_mm_pr ewcorr_S0, ewcorr_S2;
+ gmx_simd_real_t brsq_S0, brsq_S2;
+ gmx_simd_real_t ewcorr_S0, ewcorr_S2;
#endif
/* frcoul = (1/r - fsub)*r */
- gmx_mm_pr frcoul_S0;
- gmx_mm_pr frcoul_S2;
+ gmx_simd_real_t frcoul_S0;
+ gmx_simd_real_t frcoul_S2;
#ifdef CALC_COUL_TAB
/* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
- gmx_mm_pr r_S0, rs_S0, rf_S0, frac_S0;
- gmx_mm_pr r_S2, rs_S2, rf_S2, frac_S2;
+ gmx_simd_real_t rs_S0, rf_S0, frac_S0;
+ gmx_simd_real_t rs_S2, rf_S2, frac_S2;
/* Table index: rs truncated to an int */
- gmx_epi32 ti_S0, ti_S2;
+ gmx_simd_int32_t ti_S0, ti_S2;
/* Linear force table values */
- gmx_mm_pr ctab0_S0, ctab1_S0;
- gmx_mm_pr ctab0_S2, ctab1_S2;
+ gmx_simd_real_t ctab0_S0, ctab1_S0;
+ gmx_simd_real_t ctab0_S2, ctab1_S2;
#ifdef CALC_ENERGIES
/* Quadratic energy table value */
- gmx_mm_pr ctabv_S0;
- gmx_mm_pr ctabv_S2;
+ gmx_simd_real_t ctabv_S0;
+ gmx_simd_real_t ctabv_S2;
#endif
#endif
#if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
/* The potential (PME mesh) we need to subtract from 1/r */
- gmx_mm_pr vc_sub_S0;
- gmx_mm_pr vc_sub_S2;
+ gmx_simd_real_t vc_sub_S0;
+ gmx_simd_real_t vc_sub_S2;
#endif
#ifdef CALC_ENERGIES
/* Electrostatic potential */
- gmx_mm_pr vcoul_S0;
- gmx_mm_pr vcoul_S2;
+ gmx_simd_real_t vcoul_S0;
+ gmx_simd_real_t vcoul_S2;
#endif
#endif
/* The force times 1/r */
- gmx_mm_pr fscal_S0;
- gmx_mm_pr fscal_S2;
+ gmx_simd_real_t fscal_S0;
+ gmx_simd_real_t fscal_S2;
#ifdef CALC_LJ
#ifdef LJ_COMB_LB
/* LJ sigma_j/2 and sqrt(epsilon_j) */
- gmx_mm_pr hsig_j_S, seps_j_S;
+ gmx_simd_real_t hsig_j_S, seps_j_S;
/* LJ sigma_ij and epsilon_ij */
- gmx_mm_pr sig_S0, eps_S0;
+ gmx_simd_real_t sig_S0, eps_S0;
#ifndef HALF_LJ
- gmx_mm_pr sig_S2, eps_S2;
+ gmx_simd_real_t sig_S2, eps_S2;
#endif
#ifdef CALC_ENERGIES
- gmx_mm_pr sig2_S0, sig6_S0;
+ gmx_simd_real_t sig2_S0, sig6_S0;
#ifndef HALF_LJ
- gmx_mm_pr sig2_S2, sig6_S2;
+ gmx_simd_real_t sig2_S2, sig6_S2;
#endif
#endif /* LJ_COMB_LB */
#endif /* CALC_LJ */
#ifdef LJ_COMB_GEOM
- gmx_mm_pr c6s_j_S, c12s_j_S;
+ gmx_simd_real_t c6s_j_S, c12s_j_S;
#endif
-#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
/* Index for loading LJ parameters, complicated when interleaving */
int aj2;
#endif
-#ifndef FIX_LJ_C
- /* LJ C6 and C12 parameters, used with geometric comb. rule */
- gmx_mm_pr c6_S0, c12_S0;
-#ifndef HALF_LJ
- gmx_mm_pr c6_S2, c12_S2;
-#endif
-#endif
-
/* Intermediate variables for LJ calculation */
#ifndef LJ_COMB_LB
- gmx_mm_pr rinvsix_S0;
+ gmx_simd_real_t rinvsix_S0;
#ifndef HALF_LJ
- gmx_mm_pr rinvsix_S2;
+ gmx_simd_real_t rinvsix_S2;
#endif
#endif
#ifdef LJ_COMB_LB
- gmx_mm_pr sir_S0, sir2_S0, sir6_S0;
+ gmx_simd_real_t sir_S0, sir2_S0, sir6_S0;
#ifndef HALF_LJ
- gmx_mm_pr sir_S2, sir2_S2, sir6_S2;
+ gmx_simd_real_t sir_S2, sir2_S2, sir6_S2;
#endif
#endif
- gmx_mm_pr FrLJ6_S0, FrLJ12_S0;
-#ifndef HALF_LJ
- gmx_mm_pr FrLJ6_S2, FrLJ12_S2;
-#endif
-#ifdef CALC_ENERGIES
- gmx_mm_pr VLJ6_S0, VLJ12_S0, VLJ_S0;
+ gmx_simd_real_t FrLJ6_S0, FrLJ12_S0, frLJ_S0;
#ifndef HALF_LJ
- gmx_mm_pr VLJ6_S2, VLJ12_S2, VLJ_S2;
-#endif
+ gmx_simd_real_t FrLJ6_S2, FrLJ12_S2, frLJ_S2;
#endif
#endif /* CALC_LJ */
/* Atom indices (of the first atom in the cluster) */
aj = cj*UNROLLJ;
-#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB)
-#if UNROLLJ == STRIDE
+#if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
aj2 = aj*2;
-#else
- aj2 = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
#endif
-#endif
-#if UNROLLJ == STRIDE
ajx = aj*DIM;
-#else
- ajx = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
-#endif
ajy = ajx + STRIDE;
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
- gmx_load_simd_2xnn_interactions(l_cj[cjind].excl, filter_S0, filter_S2, &interact_S0, &interact_S2);
+ gmx_load_simd_2xnn_interactions(l_cj[cjind].excl,
+ filter_S0, filter_S2,
+ &interact_S0, &interact_S2);
#endif /* CHECK_EXCLS */
/* load j atom coordinates */
gmx_loaddh_pr(&jz_S, x+ajz);
/* Calculate distance */
- dx_S0 = gmx_sub_pr(ix_S0, jx_S);
- dy_S0 = gmx_sub_pr(iy_S0, jy_S);
- dz_S0 = gmx_sub_pr(iz_S0, jz_S);
- dx_S2 = gmx_sub_pr(ix_S2, jx_S);
- dy_S2 = gmx_sub_pr(iy_S2, jy_S);
- dz_S2 = gmx_sub_pr(iz_S2, jz_S);
+ dx_S0 = gmx_simd_sub_r(ix_S0, jx_S);
+ dy_S0 = gmx_simd_sub_r(iy_S0, jy_S);
+ dz_S0 = gmx_simd_sub_r(iz_S0, jz_S);
+ dx_S2 = gmx_simd_sub_r(ix_S2, jx_S);
+ dy_S2 = gmx_simd_sub_r(iy_S2, jy_S);
+ dz_S2 = gmx_simd_sub_r(iz_S2, jz_S);
/* rsq = dx*dx+dy*dy+dz*dz */
- rsq_S0 = gmx_calc_rsq_pr(dx_S0, dy_S0, dz_S0);
- rsq_S2 = gmx_calc_rsq_pr(dx_S2, dy_S2, dz_S2);
+ rsq_S0 = gmx_simd_calc_rsq_r(dx_S0, dy_S0, dz_S0);
+ rsq_S2 = gmx_simd_calc_rsq_r(dx_S2, dy_S2, dz_S2);
-#ifndef CUTOFF_BLENDV
- wco_S0 = gmx_cmplt_pr(rsq_S0, rc2_S);
- wco_S2 = gmx_cmplt_pr(rsq_S2, rc2_S);
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ wco_S0 = gmx_simd_cmplt_r(rsq_S0, rc2_S);
+ wco_S2 = gmx_simd_cmplt_r(rsq_S2, rc2_S);
#endif
#ifdef CHECK_EXCLS
#if UNROLLJ == UNROLLI
if (cj == ci_sh)
{
- wco_S0 = gmx_and_pb(wco_S0, diagonal_mask_S0);
- wco_S2 = gmx_and_pb(wco_S2, diagonal_mask_S2);
+ wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask_S0);
+ wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask_S2);
}
#else
#if UNROLLJ == 2*UNROLLI
if (cj*2 == ci_sh)
{
- wco_S0 = gmx_and_pb(wco_S0, diagonal_mask0_S0);
- wco_S2 = gmx_and_pb(wco_S2, diagonal_mask0_S2);
+ wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask0_S0);
+ wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask0_S2);
}
else if (cj*2 + 1 == ci_sh)
{
- wco_S0 = gmx_and_pb(wco_S0, diagonal_mask1_S0);
- wco_S2 = gmx_and_pb(wco_S2, diagonal_mask1_S2);
+ wco_S0 = gmx_simd_and_b(wco_S0, diagonal_mask1_S0);
+ wco_S2 = gmx_simd_and_b(wco_S2, diagonal_mask1_S2);
}
#else
#error "only UNROLLJ == UNROLLI*(1 or 2) currently supported in 2xnn kernels"
#endif
#else /* EXCL_FORCES */
/* No exclusion forces: remove all excluded atom pairs from the list */
- wco_S0 = gmx_and_pb(wco_S0, interact_S0);
- wco_S2 = gmx_and_pb(wco_S2, interact_S2);
+ wco_S0 = gmx_simd_and_b(wco_S0, interact_S0);
+ wco_S2 = gmx_simd_and_b(wco_S2, interact_S2);
#endif
#endif
#ifdef COUNT_PAIRS
{
int i, j;
- real tmpa[2*GMX_SIMD_WIDTH_HERE], *tmp;
- tmp = gmx_simd_align_real(tmpa);
+ real tmpa[2*GMX_SIMD_REAL_WIDTH], *tmp;
+ tmp = gmx_simd_align_r(tmpa);
for (i = 0; i < UNROLLI; i += 2)
{
- gmx_store_pr(tmp, i == 0 ? wco_S0 : wco_S2);
+ gmx_simd_store_r(tmp, gmx_simd_sub_r(rc2_S, i == 0 ? rsq_S0 : rsq_S2));
for (j = 0; j < 2*UNROLLJ; j++)
{
- if (!(tmp[j] == 0))
+ if (tmp[j] >= 0)
{
npair++;
}
#ifdef CHECK_EXCLS
/* For excluded pairs add a small number to avoid r^-6 = NaN */
- rsq_S0 = gmx_masknot_add_pr(interact_S0, rsq_S0, avoid_sing_S);
- rsq_S2 = gmx_masknot_add_pr(interact_S2, rsq_S2, avoid_sing_S);
+ rsq_S0 = gmx_simd_add_r(rsq_S0, gmx_simd_blendnotzero_r(avoid_sing_S, interact_S0));
+ rsq_S2 = gmx_simd_add_r(rsq_S2, gmx_simd_blendnotzero_r(avoid_sing_S, interact_S2));
#endif
/* Calculate 1/r */
- rinv_S0 = gmx_invsqrt_pr(rsq_S0);
- rinv_S2 = gmx_invsqrt_pr(rsq_S2);
+ rinv_S0 = gmx_simd_invsqrt_r(rsq_S0);
+ rinv_S2 = gmx_simd_invsqrt_r(rsq_S2);
#ifdef CALC_COULOMB
/* Load parameters for j atom */
gmx_loaddh_pr(&jq_S, q+aj);
- qq_S0 = gmx_mul_pr(iq_S0, jq_S);
- qq_S2 = gmx_mul_pr(iq_S2, jq_S);
+ qq_S0 = gmx_simd_mul_r(iq_S0, jq_S);
+ qq_S2 = gmx_simd_mul_r(iq_S2, jq_S);
#endif
#ifdef CALC_LJ
#if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
+ gmx_simd_real_t c6_S0, c12_S0;
load_lj_pair_params2(nbfp0, nbfp1, type, aj, &c6_S0, &c12_S0);
#ifndef HALF_LJ
+ gmx_simd_real_t c6_S2, c12_S2;
load_lj_pair_params2(nbfp2, nbfp3, type, aj, &c6_S2, &c12_S2);
#endif
#endif /* not defined any LJ rule */
#ifdef LJ_COMB_GEOM
gmx_loaddh_pr(&c6s_j_S, ljc+aj2+0);
gmx_loaddh_pr(&c12s_j_S, ljc+aj2+STRIDE);
- c6_S0 = gmx_mul_pr(c6s_S0, c6s_j_S );
+ gmx_simd_real_t c6_S0 = gmx_simd_mul_r(c6s_S0, c6s_j_S );
#ifndef HALF_LJ
- c6_S2 = gmx_mul_pr(c6s_S2, c6s_j_S );
+ gmx_simd_real_t c6_S2 = gmx_simd_mul_r(c6s_S2, c6s_j_S );
#endif
- c12_S0 = gmx_mul_pr(c12s_S0, c12s_j_S);
+ gmx_simd_real_t c12_S0 = gmx_simd_mul_r(c12s_S0, c12s_j_S);
#ifndef HALF_LJ
- c12_S2 = gmx_mul_pr(c12s_S2, c12s_j_S);
+ gmx_simd_real_t c12_S2 = gmx_simd_mul_r(c12s_S2, c12s_j_S);
#endif
#endif /* LJ_COMB_GEOM */
gmx_loaddh_pr(&hsig_j_S, ljc+aj2+0);
gmx_loaddh_pr(&seps_j_S, ljc+aj2+STRIDE);
- sig_S0 = gmx_add_pr(hsig_i_S0, hsig_j_S);
- eps_S0 = gmx_mul_pr(seps_i_S0, seps_j_S);
+ sig_S0 = gmx_simd_add_r(hsig_i_S0, hsig_j_S);
+ eps_S0 = gmx_simd_mul_r(seps_i_S0, seps_j_S);
#ifndef HALF_LJ
- sig_S2 = gmx_add_pr(hsig_i_S2, hsig_j_S);
- eps_S2 = gmx_mul_pr(seps_i_S2, seps_j_S);
+ sig_S2 = gmx_simd_add_r(hsig_i_S2, hsig_j_S);
+ eps_S2 = gmx_simd_mul_r(seps_i_S2, seps_j_S);
#endif
#endif /* LJ_COMB_LB */
#endif /* CALC_LJ */
-#ifndef CUTOFF_BLENDV
- rinv_S0 = gmx_blendzero_pr(rinv_S0, wco_S0);
- rinv_S2 = gmx_blendzero_pr(rinv_S2, wco_S2);
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ rinv_S0 = gmx_simd_blendzero_r(rinv_S0, wco_S0);
+ rinv_S2 = gmx_simd_blendzero_r(rinv_S2, wco_S2);
#else
+ /* This needs to be modified: It makes assumptions about the internal storage
+ * of the SIMD representation, in particular that the blendv instruction always
+ * selects based on the sign bit. If the performance is really critical, it
+ * should be turned into a function that is platform-specific.
+ */
/* We only need to mask for the cut-off: blendv is faster */
- rinv_S0 = gmx_blendv_pr(rinv_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0));
- rinv_S2 = gmx_blendv_pr(rinv_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2));
+ rinv_S0 = gmx_simd_blendv_r(rinv_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0));
+ rinv_S2 = gmx_simd_blendv_r(rinv_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2));
#endif
- rinvsq_S0 = gmx_mul_pr(rinv_S0, rinv_S0);
- rinvsq_S2 = gmx_mul_pr(rinv_S2, rinv_S2);
+ rinvsq_S0 = gmx_simd_mul_r(rinv_S0, rinv_S0);
+ rinvsq_S2 = gmx_simd_mul_r(rinv_S2, rinv_S2);
#ifdef CALC_COULOMB
/* Note that here we calculate force*r, not the usual force/r.
#ifdef EXCL_FORCES
/* Only add 1/r for non-excluded atom pairs */
- rinv_ex_S0 = gmx_blendzero_pr(rinv_S0, interact_S0);
- rinv_ex_S2 = gmx_blendzero_pr(rinv_S2, interact_S2);
+ rinv_ex_S0 = gmx_simd_blendzero_r(rinv_S0, interact_S0);
+ rinv_ex_S2 = gmx_simd_blendzero_r(rinv_S2, interact_S2);
#else
/* No exclusion forces, we always need 1/r */
#define rinv_ex_S0 rinv_S0
#ifdef CALC_COUL_RF
/* Electrostatic interactions */
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(rsq_S0, mrc_3_S, rinv_ex_S0));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(rsq_S2, mrc_3_S, rinv_ex_S2));
+ frcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(rsq_S0, mrc_3_S, rinv_ex_S0));
+ frcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(rsq_S2, mrc_3_S, rinv_ex_S2));
#ifdef CALC_ENERGIES
- vcoul_S0 = gmx_mul_pr(qq_S0, gmx_add_pr(rinv_ex_S0, gmx_add_pr(gmx_mul_pr(rsq_S0, hrc_3_S), moh_rc_S)));
- vcoul_S2 = gmx_mul_pr(qq_S2, gmx_add_pr(rinv_ex_S2, gmx_add_pr(gmx_mul_pr(rsq_S2, hrc_3_S), moh_rc_S)));
+ vcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_add_r(rinv_ex_S0, gmx_simd_add_r(gmx_simd_mul_r(rsq_S0, hrc_3_S), moh_rc_S)));
+ vcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_add_r(rinv_ex_S2, gmx_simd_add_r(gmx_simd_mul_r(rsq_S2, hrc_3_S), moh_rc_S)));
#endif
#endif
/* We need to mask (or limit) rsq for the cut-off,
* as large distances can cause an overflow in gmx_pmecorrF/V.
*/
-#ifndef CUTOFF_BLENDV
- brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S0, wco_S0));
- brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendzero_pr(rsq_S2, wco_S2));
+#ifndef NBNXN_CUTOFF_USE_BLENDV
+ brsq_S0 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S0, wco_S0));
+ brsq_S2 = gmx_simd_mul_r(beta2_S, gmx_simd_blendzero_r(rsq_S2, wco_S2));
#else
/* Strangely, putting mul on a separate line is slower (icc 13) */
- brsq_S0 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S0, zero_S, gmx_sub_pr(rc2_S, rsq_S0)));
- brsq_S2 = gmx_mul_pr(beta2_S, gmx_blendv_pr(rsq_S2, zero_S, gmx_sub_pr(rc2_S, rsq_S2)));
+ brsq_S0 = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S0, zero_S, gmx_simd_sub_r(rc2_S, rsq_S0)));
+ brsq_S2 = gmx_simd_mul_r(beta2_S, gmx_simd_blendv_r(rsq_S2, zero_S, gmx_simd_sub_r(rc2_S, rsq_S2)));
#endif
- ewcorr_S0 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S0), beta_S);
- ewcorr_S2 = gmx_mul_pr(gmx_pmecorrF_pr(brsq_S2), beta_S);
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_madd_pr(ewcorr_S0, brsq_S0, rinv_ex_S0));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_madd_pr(ewcorr_S2, brsq_S2, rinv_ex_S2));
+ ewcorr_S0 = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S0), beta_S);
+ ewcorr_S2 = gmx_simd_mul_r(gmx_simd_pmecorrF_r(brsq_S2), beta_S);
+ frcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_fmadd_r(ewcorr_S0, brsq_S0, rinv_ex_S0));
+ frcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_fmadd_r(ewcorr_S2, brsq_S2, rinv_ex_S2));
#ifdef CALC_ENERGIES
- vc_sub_S0 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S0), beta_S);
- vc_sub_S2 = gmx_mul_pr(gmx_pmecorrV_pr(brsq_S2), beta_S);
+ vc_sub_S0 = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S0), beta_S);
+ vc_sub_S2 = gmx_simd_mul_r(gmx_simd_pmecorrV_r(brsq_S2), beta_S);
#endif
#endif /* CALC_COUL_EWALD */
#ifdef CALC_COUL_TAB
/* Electrostatic interactions */
- r_S0 = gmx_mul_pr(rsq_S0, rinv_S0);
- r_S2 = gmx_mul_pr(rsq_S2, rinv_S2);
+ r_S0 = gmx_simd_mul_r(rsq_S0, rinv_S0);
+ r_S2 = gmx_simd_mul_r(rsq_S2, rinv_S2);
/* Convert r to scaled table units */
- rs_S0 = gmx_mul_pr(r_S0, invtsp_S);
- rs_S2 = gmx_mul_pr(r_S2, invtsp_S);
+ rs_S0 = gmx_simd_mul_r(r_S0, invtsp_S);
+ rs_S2 = gmx_simd_mul_r(r_S2, invtsp_S);
/* Truncate scaled r to an int */
- ti_S0 = gmx_cvttpr_epi32(rs_S0);
- ti_S2 = gmx_cvttpr_epi32(rs_S2);
-#ifdef GMX_SIMD_HAVE_FLOOR
- rf_S0 = gmx_floor_pr(rs_S0);
- rf_S2 = gmx_floor_pr(rs_S2);
+ ti_S0 = gmx_simd_cvtt_r2i(rs_S0);
+ ti_S2 = gmx_simd_cvtt_r2i(rs_S2);
+#ifdef GMX_SIMD_HAVE_TRUNC
+ rf_S0 = gmx_simd_trunc_r(rs_S0);
+ rf_S2 = gmx_simd_trunc_r(rs_S2);
#else
- rf_S0 = gmx_cvtepi32_pr(ti_S0);
- rf_S2 = gmx_cvtepi32_pr(ti_S2);
+ rf_S0 = gmx_simd_cvt_i2r(ti_S0);
+ rf_S2 = gmx_simd_cvt_i2r(ti_S2);
#endif
- frac_S0 = gmx_sub_pr(rs_S0, rf_S0);
- frac_S2 = gmx_sub_pr(rs_S2, rf_S2);
+ frac_S0 = gmx_simd_sub_r(rs_S0, rf_S0);
+ frac_S2 = gmx_simd_sub_r(rs_S2, rf_S2);
/* Load and interpolate table forces and possibly energies.
* Force and energy can be combined in one table, stride 4: FDV0
load_table_f_v(tab_coul_F, tab_coul_V, ti_S2, ti2, &ctab0_S2, &ctab1_S2, &ctabv_S2);
#endif
#endif
- fsub_S0 = gmx_add_pr(ctab0_S0, gmx_mul_pr(frac_S0, ctab1_S0));
- fsub_S2 = gmx_add_pr(ctab0_S2, gmx_mul_pr(frac_S2, ctab1_S2));
- frcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, gmx_mul_pr(fsub_S0, r_S0)));
- frcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, gmx_mul_pr(fsub_S2, r_S2)));
+ fsub_S0 = gmx_simd_add_r(ctab0_S0, gmx_simd_mul_r(frac_S0, ctab1_S0));
+ fsub_S2 = gmx_simd_add_r(ctab0_S2, gmx_simd_mul_r(frac_S2, ctab1_S2));
+ frcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, gmx_simd_mul_r(fsub_S0, r_S0)));
+ frcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, gmx_simd_mul_r(fsub_S2, r_S2)));
#ifdef CALC_ENERGIES
- vc_sub_S0 = gmx_add_pr(ctabv_S0, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S0), gmx_add_pr(ctab0_S0, fsub_S0)));
- vc_sub_S2 = gmx_add_pr(ctabv_S2, gmx_mul_pr(gmx_mul_pr(mhalfsp_S, frac_S2), gmx_add_pr(ctab0_S2, fsub_S2)));
+ vc_sub_S0 = gmx_simd_add_r(ctabv_S0, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S0), gmx_simd_add_r(ctab0_S0, fsub_S0)));
+ vc_sub_S2 = gmx_simd_add_r(ctabv_S2, gmx_simd_mul_r(gmx_simd_mul_r(mhalfsp_S, frac_S2), gmx_simd_add_r(ctab0_S2, fsub_S2)));
#endif
#endif /* CALC_COUL_TAB */
#ifndef NO_SHIFT_EWALD
/* Add Ewald potential shift to vc_sub for convenience */
#ifdef CHECK_EXCLS
- vc_sub_S0 = gmx_add_pr(vc_sub_S0, gmx_blendzero_pr(sh_ewald_S, interact_S0));
- vc_sub_S2 = gmx_add_pr(vc_sub_S2, gmx_blendzero_pr(sh_ewald_S, interact_S2));
+ vc_sub_S0 = gmx_simd_add_r(vc_sub_S0, gmx_simd_blendzero_r(sh_ewald_S, interact_S0));
+ vc_sub_S2 = gmx_simd_add_r(vc_sub_S2, gmx_simd_blendzero_r(sh_ewald_S, interact_S2));
#else
- vc_sub_S0 = gmx_add_pr(vc_sub_S0, sh_ewald_S);
- vc_sub_S2 = gmx_add_pr(vc_sub_S2, sh_ewald_S);
+ vc_sub_S0 = gmx_simd_add_r(vc_sub_S0, sh_ewald_S);
+ vc_sub_S2 = gmx_simd_add_r(vc_sub_S2, sh_ewald_S);
#endif
#endif
- vcoul_S0 = gmx_mul_pr(qq_S0, gmx_sub_pr(rinv_ex_S0, vc_sub_S0));
- vcoul_S2 = gmx_mul_pr(qq_S2, gmx_sub_pr(rinv_ex_S2, vc_sub_S2));
+ vcoul_S0 = gmx_simd_mul_r(qq_S0, gmx_simd_sub_r(rinv_ex_S0, vc_sub_S0));
+ vcoul_S2 = gmx_simd_mul_r(qq_S2, gmx_simd_sub_r(rinv_ex_S2, vc_sub_S2));
#endif
#ifdef CALC_ENERGIES
/* Mask energy for cut-off and diagonal */
- vcoul_S0 = gmx_blendzero_pr(vcoul_S0, wco_S0);
- vcoul_S2 = gmx_blendzero_pr(vcoul_S2, wco_S2);
+ vcoul_S0 = gmx_simd_blendzero_r(vcoul_S0, wco_S0);
+ vcoul_S2 = gmx_simd_blendzero_r(vcoul_S2, wco_S2);
#endif
#endif /* CALC_COULOMB */
/* Lennard-Jones interaction */
#ifdef VDW_CUTOFF_CHECK
- wco_vdw_S0 = gmx_cmplt_pr(rsq_S0, rcvdw2_S);
+ wco_vdw_S0 = gmx_simd_cmplt_r(rsq_S0, rcvdw2_S);
#ifndef HALF_LJ
- wco_vdw_S2 = gmx_cmplt_pr(rsq_S2, rcvdw2_S);
+ wco_vdw_S2 = gmx_simd_cmplt_r(rsq_S2, rcvdw2_S);
#endif
#else
/* Same cut-off for Coulomb and VdW, reuse the registers */
#endif
#ifndef LJ_COMB_LB
- rinvsix_S0 = gmx_mul_pr(rinvsq_S0, gmx_mul_pr(rinvsq_S0, rinvsq_S0));
+ rinvsix_S0 = gmx_simd_mul_r(rinvsq_S0, gmx_simd_mul_r(rinvsq_S0, rinvsq_S0));
#ifdef EXCL_FORCES
- rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, interact_S0);
+ rinvsix_S0 = gmx_simd_blendzero_r(rinvsix_S0, interact_S0);
#endif
#ifndef HALF_LJ
- rinvsix_S2 = gmx_mul_pr(rinvsq_S2, gmx_mul_pr(rinvsq_S2, rinvsq_S2));
+ rinvsix_S2 = gmx_simd_mul_r(rinvsq_S2, gmx_simd_mul_r(rinvsq_S2, rinvsq_S2));
#ifdef EXCL_FORCES
- rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, interact_S2);
+ rinvsix_S2 = gmx_simd_blendzero_r(rinvsix_S2, interact_S2);
#endif
#endif
-#ifdef VDW_CUTOFF_CHECK
- rinvsix_S0 = gmx_blendzero_pr(rinvsix_S0, wco_vdw_S0);
+
+#if defined LJ_CUT || defined LJ_POT_SWITCH
+ /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
+ FrLJ6_S0 = gmx_simd_mul_r(c6_S0, rinvsix_S0);
+#ifndef HALF_LJ
+ FrLJ6_S2 = gmx_simd_mul_r(c6_S2, rinvsix_S2);
+#endif
+ FrLJ12_S0 = gmx_simd_mul_r(c12_S0, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0));
+#ifndef HALF_LJ
+ FrLJ12_S2 = gmx_simd_mul_r(c12_S2, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2));
+#endif
+#endif
+
+#if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+ /* We switch the LJ force */
+ r_S0 = gmx_simd_mul_r(rsq_S0, rinv_S0);
+ rsw_S0 = gmx_simd_max_r(gmx_simd_sub_r(r_S0, rswitch_S), zero_S);
+ rsw2_S0 = gmx_simd_mul_r(rsw_S0, rsw_S0);
#ifndef HALF_LJ
- rinvsix_S2 = gmx_blendzero_pr(rinvsix_S2, wco_vdw_S2);
+ r_S2 = gmx_simd_mul_r(rsq_S2, rinv_S2);
+ rsw_S2 = gmx_simd_max_r(gmx_simd_sub_r(r_S2, rswitch_S), zero_S);
+ rsw2_S2 = gmx_simd_mul_r(rsw_S2, rsw_S2);
#endif
#endif
- FrLJ6_S0 = gmx_mul_pr(c6_S0, rinvsix_S0);
+
+#ifdef LJ_FORCE_SWITCH
+
+#define add_fr_switch(fr, rsw, rsw2_r, c2, c3) gmx_simd_fmadd_r(gmx_simd_fmadd_r(c3, rsw, c2), rsw2_r, fr)
+ gmx_simd_real_t rsw2_r_S0 = gmx_simd_mul_r(rsw2_S0, r_S0);
+ FrLJ6_S0 = gmx_simd_mul_r(c6_S0, add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S));
#ifndef HALF_LJ
- FrLJ6_S2 = gmx_mul_pr(c6_S2, rinvsix_S2);
+ gmx_simd_real_t rsw2_r_S2 = gmx_simd_mul_r(rsw2_S2, r_S2);
+ FrLJ6_S2 = gmx_simd_mul_r(c6_S2, add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S));
#endif
- FrLJ12_S0 = gmx_mul_pr(c12_S0, gmx_mul_pr(rinvsix_S0, rinvsix_S0));
+ FrLJ12_S0 = gmx_simd_mul_r(c12_S0, add_fr_switch(gmx_simd_mul_r(rinvsix_S0, rinvsix_S0), rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S));
#ifndef HALF_LJ
- FrLJ12_S2 = gmx_mul_pr(c12_S2, gmx_mul_pr(rinvsix_S2, rinvsix_S2));
+ FrLJ12_S2 = gmx_simd_mul_r(c12_S2, add_fr_switch(gmx_simd_mul_r(rinvsix_S2, rinvsix_S2), rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S));
#endif
+#undef add_fr_switch
+#endif /* LJ_FORCE_SWITCH */
+
#endif /* not LJ_COMB_LB */
#ifdef LJ_COMB_LB
- sir_S0 = gmx_mul_pr(sig_S0, rinv_S0);
+ sir_S0 = gmx_simd_mul_r(sig_S0, rinv_S0);
#ifndef HALF_LJ
- sir_S2 = gmx_mul_pr(sig_S2, rinv_S2);
+ sir_S2 = gmx_simd_mul_r(sig_S2, rinv_S2);
#endif
- sir2_S0 = gmx_mul_pr(sir_S0, sir_S0);
+ sir2_S0 = gmx_simd_mul_r(sir_S0, sir_S0);
#ifndef HALF_LJ
- sir2_S2 = gmx_mul_pr(sir_S2, sir_S2);
+ sir2_S2 = gmx_simd_mul_r(sir_S2, sir_S2);
#endif
- sir6_S0 = gmx_mul_pr(sir2_S0, gmx_mul_pr(sir2_S0, sir2_S0));
+ sir6_S0 = gmx_simd_mul_r(sir2_S0, gmx_simd_mul_r(sir2_S0, sir2_S0));
#ifdef EXCL_FORCES
- sir6_S0 = gmx_blendzero_pr(sir6_S0, interact_S0);
+ sir6_S0 = gmx_simd_blendzero_r(sir6_S0, interact_S0);
#endif
#ifndef HALF_LJ
- sir6_S2 = gmx_mul_pr(sir2_S2, gmx_mul_pr(sir2_S2, sir2_S2));
+ sir6_S2 = gmx_simd_mul_r(sir2_S2, gmx_simd_mul_r(sir2_S2, sir2_S2));
#ifdef EXCL_FORCES
- sir6_S2 = gmx_blendzero_pr(sir6_S2, interact_S2);
+ sir6_S2 = gmx_simd_blendzero_r(sir6_S2, interact_S2);
#endif
#endif
#ifdef VDW_CUTOFF_CHECK
- sir6_S0 = gmx_blendzero_pr(sir6_S0, wco_vdw_S0);
+ sir6_S0 = gmx_simd_blendzero_r(sir6_S0, wco_vdw_S0);
#ifndef HALF_LJ
- sir6_S2 = gmx_blendzero_pr(sir6_S2, wco_vdw_S2);
+ sir6_S2 = gmx_simd_blendzero_r(sir6_S2, wco_vdw_S2);
#endif
#endif
- FrLJ6_S0 = gmx_mul_pr(eps_S0, sir6_S0);
+ FrLJ6_S0 = gmx_simd_mul_r(eps_S0, sir6_S0);
#ifndef HALF_LJ
- FrLJ6_S2 = gmx_mul_pr(eps_S2, sir6_S2);
+ FrLJ6_S2 = gmx_simd_mul_r(eps_S2, sir6_S2);
#endif
- FrLJ12_S0 = gmx_mul_pr(FrLJ6_S0, sir6_S0);
+ FrLJ12_S0 = gmx_simd_mul_r(FrLJ6_S0, sir6_S0);
#ifndef HALF_LJ
- FrLJ12_S2 = gmx_mul_pr(FrLJ6_S2, sir6_S2);
+ FrLJ12_S2 = gmx_simd_mul_r(FrLJ6_S2, sir6_S2);
#endif
#if defined CALC_ENERGIES
/* We need C6 and C12 to calculate the LJ potential shift */
- sig2_S0 = gmx_mul_pr(sig_S0, sig_S0);
+ sig2_S0 = gmx_simd_mul_r(sig_S0, sig_S0);
#ifndef HALF_LJ
- sig2_S2 = gmx_mul_pr(sig_S2, sig_S2);
+ sig2_S2 = gmx_simd_mul_r(sig_S2, sig_S2);
#endif
- sig6_S0 = gmx_mul_pr(sig2_S0, gmx_mul_pr(sig2_S0, sig2_S0));
+ sig6_S0 = gmx_simd_mul_r(sig2_S0, gmx_simd_mul_r(sig2_S0, sig2_S0));
#ifndef HALF_LJ
- sig6_S2 = gmx_mul_pr(sig2_S2, gmx_mul_pr(sig2_S2, sig2_S2));
+ sig6_S2 = gmx_simd_mul_r(sig2_S2, gmx_simd_mul_r(sig2_S2, sig2_S2));
#endif
- c6_S0 = gmx_mul_pr(eps_S0, sig6_S0);
+ gmx_simd_real_t c6_S0 = gmx_simd_mul_r(eps_S0, sig6_S0);
#ifndef HALF_LJ
- c6_S2 = gmx_mul_pr(eps_S2, sig6_S2);
+ gmx_simd_real_t c6_S2 = gmx_simd_mul_r(eps_S2, sig6_S2);
#endif
- c12_S0 = gmx_mul_pr(c6_S0, sig6_S0);
+ gmx_simd_real_t c12_S0 = gmx_simd_mul_r(c6_S0, sig6_S0);
#ifndef HALF_LJ
- c12_S2 = gmx_mul_pr(c6_S2, sig6_S2);
+ gmx_simd_real_t c12_S2 = gmx_simd_mul_r(c6_S2, sig6_S2);
#endif
#endif
#endif /* LJ_COMB_LB */
+ /* Determine the total scalar LJ force*r */
+ frLJ_S0 = gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0);
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2);
+#endif
+
+#if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
+
+#ifdef LJ_CUT
+ /* Calculate the LJ energies, with constant potential shift */
+ gmx_simd_real_t VLJ6_S0 = gmx_simd_mul_r(sixth_S, gmx_simd_fmadd_r(c6_S0, p6_cpot_S, FrLJ6_S0));
+#ifndef HALF_LJ
+ gmx_simd_real_t VLJ6_S2 = gmx_simd_mul_r(sixth_S, gmx_simd_fmadd_r(c6_S2, p6_cpot_S, FrLJ6_S2));
+#endif
+ gmx_simd_real_t VLJ12_S0 = gmx_simd_mul_r(twelveth_S, gmx_simd_fmadd_r(c12_S0, p12_cpot_S, FrLJ12_S0));
+#ifndef HALF_LJ
+ gmx_simd_real_t VLJ12_S2 = gmx_simd_mul_r(twelveth_S, gmx_simd_fmadd_r(c12_S2, p12_cpot_S, FrLJ12_S2));
+#endif
+#endif /* LJ_CUT */
+
+#ifdef LJ_FORCE_SWITCH
+#define v_fswitch_pr(rsw, rsw2, c0, c3, c4) gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), gmx_simd_mul_r(rsw2, rsw), c0)
+
+ gmx_simd_real_t VLJ6_S0 = gmx_simd_mul_r(c6_S0, gmx_simd_fmadd_r(sixth_S, rinvsix_S0, v_fswitch_pr(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
+#ifndef HALF_LJ
+ gmx_simd_real_t VLJ6_S2 = gmx_simd_mul_r(c6_S2, gmx_simd_fmadd_r(sixth_S, rinvsix_S2, v_fswitch_pr(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
+#endif
+ gmx_simd_real_t VLJ12_S0 = gmx_simd_mul_r(c12_S0, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0), v_fswitch_pr(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S)));
+#ifndef HALF_LJ
+ gmx_simd_real_t VLJ12_S2 = gmx_simd_mul_r(c12_S2, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2), v_fswitch_pr(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S)));
+#endif
+#undef v_fswitch_pr
+#endif /* LJ_FORCE_SWITCH */
+
+ /* Add up the repulsion and dispersion */
+ gmx_simd_real_t VLJ_S0 = gmx_simd_sub_r(VLJ12_S0, VLJ6_S0);
+#ifndef HALF_LJ
+ gmx_simd_real_t VLJ_S2 = gmx_simd_sub_r(VLJ12_S2, VLJ6_S2);
+#endif
+
+#endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
+
+#ifdef LJ_POT_SWITCH
+ /* We always need the potential, since it is needed for the force */
+ gmx_simd_real_t VLJ_S0 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S0, gmx_simd_mul_r(twelveth_S, FrLJ12_S0));
+#ifndef HALF_LJ
+ gmx_simd_real_t VLJ_S2 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S2, gmx_simd_mul_r(twelveth_S, FrLJ12_S2));
+#endif
+
+ {
+ gmx_simd_real_t sw_S0, dsw_S0;
+#ifndef HALF_LJ
+ gmx_simd_real_t sw_S2, dsw_S2;
+#endif
+
+#define switch_pr(rsw, rsw2, c3, c4, c5) gmx_simd_fmadd_r(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c5, rsw, c4), rsw, c3), gmx_simd_mul_r(rsw2, rsw), one_S)
+#define dswitch_pr(rsw, rsw2, c2, c3, c4) gmx_simd_mul_r(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), rsw, c2), rsw2)
+
+ sw_S0 = switch_pr(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
+ dsw_S0 = dswitch_pr(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
+#ifndef HALF_LJ
+ sw_S2 = switch_pr(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
+ dsw_S2 = dswitch_pr(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
+#endif
+ frLJ_S0 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S0, VLJ_S0), r_S0, gmx_simd_mul_r(sw_S0, frLJ_S0));
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S2, VLJ_S2), r_S2, gmx_simd_mul_r(sw_S2, frLJ_S2));
+#endif
+#ifdef CALC_ENERGIES
+ VLJ_S0 = gmx_simd_mul_r(sw_S0, VLJ_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_mul_r(sw_S2, VLJ_S2);
+#endif
+#endif
+
+#undef switch_pr
+#undef dswitch_pr
+ }
+#endif /* LJ_POT_SWITCH */
+
+#if defined CALC_ENERGIES && defined CHECK_EXCLS
+ /* The potential shift should be removed for excluded pairs */
+ VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, interact_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, interact_S2);
+#endif
+#endif
+
+#ifdef LJ_EWALD_GEOM
+ {
+ gmx_simd_real_t c6s_j_S;
+ gmx_simd_real_t c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0;
+#ifndef HALF_LJ
+ gmx_simd_real_t c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2;
+#endif
+#ifdef CALC_ENERGIES
+ gmx_simd_real_t sh_mask_S0;
+#ifndef HALF_LJ
+ gmx_simd_real_t sh_mask_S2;
+#endif
+#endif
+
+ /* Determine C6 for the grid using the geometric combination rule */
+ gmx_loaddh_pr(&c6s_j_S, ljc+aj2+0);
+ c6grid_S0 = gmx_simd_mul_r(c6s_S0, c6s_j_S);
+#ifndef HALF_LJ
+ c6grid_S2 = gmx_simd_mul_r(c6s_S2, c6s_j_S);
+#endif
+
+#ifdef CHECK_EXCLS
+ /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
+ rinvsix_nm_S0 = gmx_simd_mul_r(rinvsq_S0, gmx_simd_mul_r(rinvsq_S0, rinvsq_S0));
+#ifndef HALF_LJ
+ rinvsix_nm_S2 = gmx_simd_mul_r(rinvsq_S2, gmx_simd_mul_r(rinvsq_S2, rinvsq_S2));
+#endif
+#else
+ /* We didn't use a mask, so we can copy */
+ rinvsix_nm_S0 = rinvsix_S0;
+#ifndef HALF_LJ
+ rinvsix_nm_S2 = rinvsix_S2;
+#endif
+#endif
+
+ /* Mask for the cut-off to avoid overflow of cr2^2 */
+ cr2_S0 = gmx_simd_mul_r(lje_c2_S, gmx_simd_blendzero_r(rsq_S0, wco_vdw_S0));
+#ifndef HALF_LJ
+ cr2_S2 = gmx_simd_mul_r(lje_c2_S, gmx_simd_blendzero_r(rsq_S2, wco_vdw_S2));
+#endif
+ expmcr2_S0 = gmx_simd_exp_r(gmx_simd_mul_r(mone_S, cr2_S0));
+#ifndef HALF_LJ
+ expmcr2_S2 = gmx_simd_exp_r(gmx_simd_mul_r(mone_S, cr2_S2));
+#endif
+
+ /* 1 + cr2 + 1/2*cr2^2 */
+ poly_S0 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S0, one_S), cr2_S0, one_S);
+#ifndef HALF_LJ
+ poly_S2 = gmx_simd_fmadd_r(gmx_simd_fmadd_r(half_S, cr2_S2, one_S), cr2_S2, one_S);
+#endif
+
+ /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
+ * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
+ */
+ frLJ_S0 = gmx_simd_fmadd_r(c6grid_S0, gmx_simd_fnmadd_r(expmcr2_S0, gmx_simd_fmadd_r(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0);
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_fmadd_r(c6grid_S2, gmx_simd_fnmadd_r(expmcr2_S2, gmx_simd_fmadd_r(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2);
+#endif
+
+#ifdef CALC_ENERGIES
+#ifdef CHECK_EXCLS
+ sh_mask_S0 = gmx_simd_blendzero_r(lje_vc_S, interact_S0);
+#ifndef HALF_LJ
+ sh_mask_S2 = gmx_simd_blendzero_r(lje_vc_S, interact_S2);
+#endif
+#else
+ sh_mask_S0 = lje_vc_S;
+#ifndef HALF_LJ
+ sh_mask_S2 = lje_vc_S;
+#endif
+#endif
+
+ VLJ_S0 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S0), gmx_simd_fmadd_r(rinvsix_nm_S0, gmx_simd_fnmadd_r(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_fmadd_r(gmx_simd_mul_r(sixth_S, c6grid_S2), gmx_simd_fmadd_r(rinvsix_nm_S2, gmx_simd_fnmadd_r(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2);
+#endif
+#endif /* CALC_ENERGIES */
+ }
+#endif /* LJ_EWALD_GEOM */
+
+#if defined VDW_CUTOFF_CHECK
+ /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
+ * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
+ */
+ frLJ_S0 = gmx_simd_blendzero_r(frLJ_S0, wco_vdw_S0);
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_blendzero_r(frLJ_S2, wco_vdw_S2);
+#endif
+#endif
+
+#ifdef CALC_ENERGIES
+ /* The potential shift should be removed for pairs beyond cut-off */
+ VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, wco_vdw_S0);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, wco_vdw_S2);
+#endif
+#endif
+
#endif /* CALC_LJ */
#ifdef CALC_ENERGIES
#ifdef CALC_COULOMB
#ifndef ENERGY_GROUPS
- vctot_S = gmx_add_pr(vctot_S, gmx_add_pr(vcoul_S0, vcoul_S2));
+ vctot_S = gmx_simd_add_r(vctot_S, gmx_simd_add_r(vcoul_S0, vcoul_S2));
#else
add_ener_grp_halves(vcoul_S0, vctp[0], vctp[1], egp_jj);
add_ener_grp_halves(vcoul_S2, vctp[2], vctp[3], egp_jj);
#endif
#ifdef CALC_LJ
- /* Calculate the LJ energies */
- VLJ6_S0 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S0, gmx_mul_pr(c6_S0, sh_invrc6_S)));
-#ifndef HALF_LJ
- VLJ6_S2 = gmx_mul_pr(sixth_S, gmx_sub_pr(FrLJ6_S2, gmx_mul_pr(c6_S2, sh_invrc6_S)));
-#endif
- VLJ12_S0 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S0, gmx_mul_pr(c12_S0, sh_invrc12_S)));
-#ifndef HALF_LJ
- VLJ12_S2 = gmx_mul_pr(twelveth_S, gmx_sub_pr(FrLJ12_S2, gmx_mul_pr(c12_S2, sh_invrc12_S)));
-#endif
-
- VLJ_S0 = gmx_sub_pr(VLJ12_S0, VLJ6_S0);
-#ifndef HALF_LJ
- VLJ_S2 = gmx_sub_pr(VLJ12_S2, VLJ6_S2);
-#endif
- /* The potential shift should be removed for pairs beyond cut-off */
- VLJ_S0 = gmx_blendzero_pr(VLJ_S0, wco_vdw_S0);
-#ifndef HALF_LJ
- VLJ_S2 = gmx_blendzero_pr(VLJ_S2, wco_vdw_S2);
-#endif
-#ifdef CHECK_EXCLS
- /* The potential shift should be removed for excluded pairs */
- VLJ_S0 = gmx_blendzero_pr(VLJ_S0, interact_S0);
-#ifndef HALF_LJ
- VLJ_S2 = gmx_blendzero_pr(VLJ_S2, interact_S2);
-#endif
-#endif
#ifndef ENERGY_GROUPS
- Vvdwtot_S = gmx_add_pr(Vvdwtot_S,
+ Vvdwtot_S = gmx_simd_add_r(Vvdwtot_S,
#ifndef HALF_LJ
- gmx_add_pr(VLJ_S0, VLJ_S2)
+ gmx_simd_add_r(VLJ_S0, VLJ_S2)
#else
- VLJ_S0
+ VLJ_S0
#endif
- );
+ );
#else
add_ener_grp_halves(VLJ_S0, vvdwtp[0], vvdwtp[1], egp_jj);
#ifndef HALF_LJ
#endif /* CALC_ENERGIES */
#ifdef CALC_LJ
- fscal_S0 = gmx_mul_pr(rinvsq_S0,
#ifdef CALC_COULOMB
- gmx_add_pr(frcoul_S0,
+ fscal_S0 = gmx_simd_mul_r(rinvsq_S0, gmx_simd_add_r(frcoul_S0, frLJ_S0));
#else
- (
+ fscal_S0 = gmx_simd_mul_r(rinvsq_S0, frLJ_S0);
#endif
- gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
#else
- fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0);
+ fscal_S0 = gmx_simd_mul_r(rinvsq_S0, frcoul_S0);
#endif /* CALC_LJ */
#if defined CALC_LJ && !defined HALF_LJ
- fscal_S2 = gmx_mul_pr(rinvsq_S2,
#ifdef CALC_COULOMB
- gmx_add_pr(frcoul_S2,
+ fscal_S2 = gmx_simd_mul_r(rinvsq_S2, gmx_simd_add_r(frcoul_S2, frLJ_S2));
#else
- (
+ fscal_S2 = gmx_simd_mul_r(rinvsq_S2, frLJ_S2);
#endif
- gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
#else
/* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
- fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2);
+ fscal_S2 = gmx_simd_mul_r(rinvsq_S2, frcoul_S2);
#endif
/* Calculate temporary vectorial force */
- tx_S0 = gmx_mul_pr(fscal_S0, dx_S0);
- tx_S2 = gmx_mul_pr(fscal_S2, dx_S2);
- ty_S0 = gmx_mul_pr(fscal_S0, dy_S0);
- ty_S2 = gmx_mul_pr(fscal_S2, dy_S2);
- tz_S0 = gmx_mul_pr(fscal_S0, dz_S0);
- tz_S2 = gmx_mul_pr(fscal_S2, dz_S2);
+ tx_S0 = gmx_simd_mul_r(fscal_S0, dx_S0);
+ tx_S2 = gmx_simd_mul_r(fscal_S2, dx_S2);
+ ty_S0 = gmx_simd_mul_r(fscal_S0, dy_S0);
+ ty_S2 = gmx_simd_mul_r(fscal_S2, dy_S2);
+ tz_S0 = gmx_simd_mul_r(fscal_S0, dz_S0);
+ tz_S2 = gmx_simd_mul_r(fscal_S2, dz_S2);
/* Increment i atom force */
- fix_S0 = gmx_add_pr(fix_S0, tx_S0);
- fix_S2 = gmx_add_pr(fix_S2, tx_S2);
- fiy_S0 = gmx_add_pr(fiy_S0, ty_S0);
- fiy_S2 = gmx_add_pr(fiy_S2, ty_S2);
- fiz_S0 = gmx_add_pr(fiz_S0, tz_S0);
- fiz_S2 = gmx_add_pr(fiz_S2, tz_S2);
+ fix_S0 = gmx_simd_add_r(fix_S0, tx_S0);
+ fix_S2 = gmx_simd_add_r(fix_S2, tx_S2);
+ fiy_S0 = gmx_simd_add_r(fiy_S0, ty_S0);
+ fiy_S2 = gmx_simd_add_r(fiy_S2, ty_S2);
+ fiz_S0 = gmx_simd_add_r(fiz_S0, tz_S0);
+ fiz_S2 = gmx_simd_add_r(fiz_S2, tz_S2);
/* Decrement j atom force */
gmx_load_hpr(&fjx_S, f+ajx);
#undef wco_vdw_S0
#undef wco_vdw_S2
-#undef CUTOFF_BLENDV
+#undef NBNXN_CUTOFF_USE_BLENDV
#undef EXCL_FORCES