+ /* Determine the total scalar LJ force*r */
+ frLJ_S0 = gmx_simd_sub_r(FrLJ12_S0, FrLJ6_S0);
+ frLJ_S1 = gmx_simd_sub_r(FrLJ12_S1, FrLJ6_S1);
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_sub_r(FrLJ12_S2, FrLJ6_S2);
+ frLJ_S3 = gmx_simd_sub_r(FrLJ12_S3, FrLJ6_S3);
+#endif
+
+#if defined CALC_ENERGIES && !defined LJ_POT_SWITCH
+#ifndef LJ_FORCE_SWITCH
+ /* Calculate the LJ energies, with constant potential shift */
+ VLJ6_S0 = gmx_simd_mul_r(sixth_S, gmx_simd_fmadd_r(c6_S0, p6_cpot_S, FrLJ6_S0));
+ VLJ6_S1 = gmx_simd_mul_r(sixth_S, gmx_simd_fmadd_r(c6_S1, p6_cpot_S, FrLJ6_S1));
+#ifndef HALF_LJ
+ VLJ6_S2 = gmx_simd_mul_r(sixth_S, gmx_simd_fmadd_r(c6_S2, p6_cpot_S, FrLJ6_S2));
+ VLJ6_S3 = gmx_simd_mul_r(sixth_S, gmx_simd_fmadd_r(c6_S3, p6_cpot_S, FrLJ6_S3));
+#endif
+ VLJ12_S0 = gmx_simd_mul_r(twelveth_S, gmx_simd_fmadd_r(c12_S0, p12_cpot_S, FrLJ12_S0));
+ VLJ12_S1 = gmx_simd_mul_r(twelveth_S, gmx_simd_fmadd_r(c12_S1, p12_cpot_S, FrLJ12_S1));
+#ifndef HALF_LJ
+ VLJ12_S2 = gmx_simd_mul_r(twelveth_S, gmx_simd_fmadd_r(c12_S2, p12_cpot_S, FrLJ12_S2));
+ VLJ12_S3 = gmx_simd_mul_r(twelveth_S, gmx_simd_fmadd_r(c12_S3, p12_cpot_S, FrLJ12_S3));
+#endif
+#else
+
+#define v_fswitch_r(rsw, rsw2, c0, c3, c4) gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), gmx_simd_mul_r(rsw2, rsw), c0)
+
+ VLJ6_S0 = gmx_simd_mul_r(c6_S0, gmx_simd_fmadd_r(sixth_S, rinvsix_S0, v_fswitch_r(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
+ VLJ6_S1 = gmx_simd_mul_r(c6_S1, gmx_simd_fmadd_r(sixth_S, rinvsix_S1, v_fswitch_r(rsw_S1, rsw2_S1, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
+#ifndef HALF_LJ
+ VLJ6_S2 = gmx_simd_mul_r(c6_S2, gmx_simd_fmadd_r(sixth_S, rinvsix_S2, v_fswitch_r(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
+ VLJ6_S3 = gmx_simd_mul_r(c6_S3, gmx_simd_fmadd_r(sixth_S, rinvsix_S3, v_fswitch_r(rsw_S3, rsw2_S3, p6_6cpot_S, p6_vc3_S, p6_vc4_S)));
+#endif
+ VLJ12_S0 = gmx_simd_mul_r(c12_S0, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S0, rinvsix_S0), v_fswitch_r(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S)));
+ VLJ12_S1 = gmx_simd_mul_r(c12_S1, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S1, rinvsix_S1), v_fswitch_r(rsw_S1, rsw2_S1, p12_12cpot_S, p12_vc3_S, p12_vc4_S)));
+#ifndef HALF_LJ
+ VLJ12_S2 = gmx_simd_mul_r(c12_S2, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S2, rinvsix_S2), v_fswitch_r(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S)));
+ VLJ12_S3 = gmx_simd_mul_r(c12_S3, gmx_simd_fmadd_r(twelveth_S, gmx_simd_mul_r(rinvsix_S3, rinvsix_S3), v_fswitch_r(rsw_S3, rsw2_S3, p12_12cpot_S, p12_vc3_S, p12_vc4_S)));
+#endif
+#undef v_fswitch_r
+#endif /* LJ_FORCE_SWITCH */
+
+ /* Add up the repulsion and dispersion */
+ VLJ_S0 = gmx_simd_sub_r(VLJ12_S0, VLJ6_S0);
+ VLJ_S1 = gmx_simd_sub_r(VLJ12_S1, VLJ6_S1);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_sub_r(VLJ12_S2, VLJ6_S2);
+ VLJ_S3 = gmx_simd_sub_r(VLJ12_S3, VLJ6_S3);
+#endif
+#endif /* CALC_ENERGIES && !LJ_POT_SWITCH */
+
+#ifdef LJ_POT_SWITCH
+ /* We always need the potential, since it is needed for the force */
+ VLJ_S0 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S0, gmx_simd_mul_r(twelveth_S, FrLJ12_S0));
+ VLJ_S1 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S1, gmx_simd_mul_r(twelveth_S, FrLJ12_S1));
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S2, gmx_simd_mul_r(twelveth_S, FrLJ12_S2));
+ VLJ_S3 = gmx_simd_fnmadd_r(sixth_S, FrLJ6_S3, gmx_simd_mul_r(twelveth_S, FrLJ12_S3));
+#endif
+
+ {
+ gmx_simd_real_t sw_S0, dsw_S0;
+ gmx_simd_real_t sw_S1, dsw_S1;
+#ifndef HALF_LJ
+ gmx_simd_real_t sw_S2, dsw_S2;
+ gmx_simd_real_t sw_S3, dsw_S3;
+#endif
+
+#define switch_r(rsw, rsw2, c3, c4, c5) gmx_simd_fmadd_r(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c5, rsw, c4), rsw, c3), gmx_simd_mul_r(rsw2, rsw), one_S)
+#define dswitch_r(rsw, rsw2, c2, c3, c4) gmx_simd_mul_r(gmx_simd_fmadd_r(gmx_simd_fmadd_r(c4, rsw, c3), rsw, c2), rsw2)
+
+ sw_S0 = switch_r(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
+ dsw_S0 = dswitch_r(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
+ sw_S1 = switch_r(rsw_S1, rsw2_S1, swV3_S, swV4_S, swV5_S);
+ dsw_S1 = dswitch_r(rsw_S1, rsw2_S1, swF2_S, swF3_S, swF4_S);
+#ifndef HALF_LJ
+ sw_S2 = switch_r(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
+ dsw_S2 = dswitch_r(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
+ sw_S3 = switch_r(rsw_S3, rsw2_S3, swV3_S, swV4_S, swV5_S);
+ dsw_S3 = dswitch_r(rsw_S3, rsw2_S3, swF2_S, swF3_S, swF4_S);
+#endif
+ frLJ_S0 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S0, VLJ_S0), r_S0, gmx_simd_mul_r(sw_S0, frLJ_S0));
+ frLJ_S1 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S1, VLJ_S1), r_S1, gmx_simd_mul_r(sw_S1, frLJ_S1));
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S2, VLJ_S2), r_S2, gmx_simd_mul_r(sw_S2, frLJ_S2));
+ frLJ_S3 = gmx_simd_fnmadd_r(gmx_simd_mul_r(dsw_S3, VLJ_S3), r_S3, gmx_simd_mul_r(sw_S3, frLJ_S3));
+#endif
+#ifdef CALC_ENERGIES
+ VLJ_S0 = gmx_simd_mul_r(sw_S0, VLJ_S0);
+ VLJ_S1 = gmx_simd_mul_r(sw_S1, VLJ_S1);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_mul_r(sw_S2, VLJ_S2);
+ VLJ_S3 = gmx_simd_mul_r(sw_S3, VLJ_S3);
+#endif
+#endif
+
+#undef switch_r
+#undef dswitch_r
+ }
+#endif /* LJ_POT_SWITCH */
+
+#if defined VDW_CUTOFF_CHECK
+ /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
+ * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
+ */
+ frLJ_S0 = gmx_simd_blendzero_r(frLJ_S0, wco_vdw_S0);
+ frLJ_S1 = gmx_simd_blendzero_r(frLJ_S1, wco_vdw_S1);
+#ifndef HALF_LJ
+ frLJ_S2 = gmx_simd_blendzero_r(frLJ_S2, wco_vdw_S2);
+ frLJ_S3 = gmx_simd_blendzero_r(frLJ_S3, wco_vdw_S3);
+#endif
+#endif
+
+#ifdef CALC_ENERGIES
+ /* The potential shift should be removed for pairs beyond cut-off */
+ VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, wco_vdw_S0);
+ VLJ_S1 = gmx_simd_blendzero_r(VLJ_S1, wco_vdw_S1);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, wco_vdw_S2);
+ VLJ_S3 = gmx_simd_blendzero_r(VLJ_S3, wco_vdw_S3);
+#endif
+#endif
+
+#if defined CALC_ENERGIES && defined CHECK_EXCLS
+ /* The potential shift should be removed for excluded pairs */
+ VLJ_S0 = gmx_simd_blendzero_r(VLJ_S0, interact_S0);
+ VLJ_S1 = gmx_simd_blendzero_r(VLJ_S1, interact_S1);
+#ifndef HALF_LJ
+ VLJ_S2 = gmx_simd_blendzero_r(VLJ_S2, interact_S2);
+ VLJ_S3 = gmx_simd_blendzero_r(VLJ_S3, interact_S3);
+#endif
+#endif
+
+