src/gromacs/nbnxm/kernels_simd_4xm/kernel_inner.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
   5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
   6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   7  * and including many others, as listed in the AUTHORS file in the
   8  * top-level source directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 /* Doxygen gets confused (buggy) about the block in this file in combination with
  38  * the  namespace prefix, and thinks store is documented here.
  39  * This will solve itself with the second-generation nbnxn kernels, so for now
  40  * we just tell Doxygen to stay out.
  41  */
  42 #ifndef DOXYGEN
  43
  44 /* This is the innermost loop contents for the 4 x N atom simd kernel.
  45  * This flavor of the kernel calculates interactions of 4 i-atoms
  46  * with N j-atoms stored in N wide simd registers.
  47  */
  48
  49 /* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
  50  * forces on excluded atom pairs here in the non-bonded loops.
  51  * But when energies and/or virial is required we calculate them
  52  * separately to as then it is easier to separate the energy and virial
  53  * contributions.
  54  */
  55 #    if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
  56 #        define EXCL_FORCES
  57 #    endif
  58
  59 {
  60     int cj, ajx, ajy, ajz;
  61     int gmx_unused aj;
  62
  63 #    ifdef ENERGY_GROUPS
  64     /* Energy group indices for two atoms packed into one int */
  65     int egp_jj[UNROLLJ / 2];
  66 #    endif
  67
  68 #    ifdef CHECK_EXCLS
  69     /* Interaction (non-exclusion) mask of all 1's or 0's */
  70     SimdBool interact_S0;
  71     SimdBool interact_S1;
  72     SimdBool interact_S2;
  73     SimdBool interact_S3;
  74 #    endif
  75
  76     SimdReal jx_S, jy_S, jz_S;
  77     SimdReal dx_S0, dy_S0, dz_S0;
  78     SimdReal dx_S1, dy_S1, dz_S1;
  79     SimdReal dx_S2, dy_S2, dz_S2;
  80     SimdReal dx_S3, dy_S3, dz_S3;
  81     SimdReal tx_S0, ty_S0, tz_S0;
  82     SimdReal tx_S1, ty_S1, tz_S1;
  83     SimdReal tx_S2, ty_S2, tz_S2;
  84     SimdReal tx_S3, ty_S3, tz_S3;
  85     SimdReal rsq_S0, rinv_S0, rinvsq_S0;
  86     SimdReal rsq_S1, rinv_S1, rinvsq_S1;
  87     SimdReal rsq_S2, rinv_S2, rinvsq_S2;
  88     SimdReal rsq_S3, rinv_S3, rinvsq_S3;
  89
  90     /* wco: within cut-off, mask of all 1's or 0's */
  91     SimdBool wco_S0;
  92     SimdBool wco_S1;
  93     SimdBool wco_S2;
  94     SimdBool wco_S3;
  95
  96 #    ifdef VDW_CUTOFF_CHECK
  97     SimdBool wco_vdw_S0;
  98     SimdBool wco_vdw_S1;
  99 #        ifndef HALF_LJ
 100     SimdBool wco_vdw_S2;
 101     SimdBool wco_vdw_S3;
 102 #        endif
 103 #    endif
 104
 105 #    if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 106     SimdReal                                                                                  r_S0;
 107     SimdReal                                                                                  r_S1;
 108 #        if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
 109     SimdReal r_S2;
 110     SimdReal r_S3;
 111 #        endif
 112 #    endif
 113
 114 #    if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 115     SimdReal                               rsw_S0, rsw2_S0;
 116     SimdReal                               rsw_S1, rsw2_S1;
 117 #        ifndef HALF_LJ
 118     SimdReal rsw_S2, rsw2_S2;
 119     SimdReal rsw_S3, rsw2_S3;
 120 #        endif
 121 #    endif
 122
 123 #    ifdef CALC_COULOMB
 124 #        ifdef CHECK_EXCLS
 125     /* 1/r masked with the interaction mask */
 126     SimdReal rinv_ex_S0;
 127     SimdReal rinv_ex_S1;
 128     SimdReal rinv_ex_S2;
 129     SimdReal rinv_ex_S3;
 130 #        endif
 131     SimdReal jq_S;
 132     SimdReal qq_S0;
 133     SimdReal qq_S1;
 134     SimdReal qq_S2;
 135     SimdReal qq_S3;
 136 #        ifdef CALC_COUL_TAB
 137     /* The force (PME mesh force) we need to subtract from 1/r^2 */
 138     SimdReal fsub_S0;
 139     SimdReal fsub_S1;
 140     SimdReal fsub_S2;
 141     SimdReal fsub_S3;
 142 #        endif
 143 #        ifdef CALC_COUL_EWALD
 144     SimdReal brsq_S0, brsq_S1, brsq_S2, brsq_S3;
 145     SimdReal ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
 146 #        endif
 147
 148     /* frcoul = (1/r - fsub)*r */
 149     SimdReal frcoul_S0;
 150     SimdReal frcoul_S1;
 151     SimdReal frcoul_S2;
 152     SimdReal frcoul_S3;
 153 #        ifdef CALC_COUL_TAB
 154     /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
 155     SimdReal rs_S0, rf_S0, frac_S0;
 156     SimdReal rs_S1, rf_S1, frac_S1;
 157     SimdReal rs_S2, rf_S2, frac_S2;
 158     SimdReal rs_S3, rf_S3, frac_S3;
 159     /* Table index: rs truncated to an int */
 160     SimdInt32 ti_S0, ti_S1, ti_S2, ti_S3;
 161     /* Linear force table values */
 162     SimdReal ctab0_S0, ctab1_S0;
 163     SimdReal ctab0_S1, ctab1_S1;
 164     SimdReal ctab0_S2, ctab1_S2;
 165     SimdReal ctab0_S3, ctab1_S3;
 166 #            ifdef CALC_ENERGIES
 167     /* Quadratic energy table value */
 168     SimdReal ctabv_S0, dum_S0;
 169     SimdReal ctabv_S1, dum_S1;
 170     SimdReal ctabv_S2, dum_S2;
 171     SimdReal ctabv_S3, dum_S3;
 172 #            endif
 173 #        endif
 174 #        if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 175     /* The potential (PME mesh) we need to subtract from 1/r */
 176     SimdReal vc_sub_S0;
 177     SimdReal vc_sub_S1;
 178     SimdReal vc_sub_S2;
 179     SimdReal vc_sub_S3;
 180 #        endif
 181 #        ifdef CALC_ENERGIES
 182     /* Electrostatic potential */
 183     SimdReal vcoul_S0;
 184     SimdReal vcoul_S1;
 185     SimdReal vcoul_S2;
 186     SimdReal vcoul_S3;
 187 #        endif
 188 #    endif
 189     /* The force times 1/r */
 190     SimdReal fscal_S0;
 191     SimdReal fscal_S1;
 192     SimdReal fscal_S2;
 193     SimdReal fscal_S3;
 194
 195 #    ifdef CALC_LJ
 196 #        ifdef LJ_COMB_LB
 197     /* LJ sigma_j/2 and sqrt(epsilon_j) */
 198     SimdReal hsig_j_S, seps_j_S;
 199     /* LJ sigma_ij and epsilon_ij */
 200     SimdReal sig_S0, eps_S0;
 201     SimdReal sig_S1, eps_S1;
 202 #            ifndef HALF_LJ
 203     SimdReal sig_S2, eps_S2;
 204     SimdReal sig_S3, eps_S3;
 205 #            endif
 206 #            ifdef CALC_ENERGIES
 207     SimdReal sig2_S0, sig6_S0;
 208     SimdReal sig2_S1, sig6_S1;
 209 #                ifndef HALF_LJ
 210     SimdReal sig2_S2, sig6_S2;
 211     SimdReal sig2_S3, sig6_S3;
 212 #                endif
 213 #            endif /* LJ_COMB_LB */
 214 #        endif     /* CALC_LJ */
 215
 216 #        ifdef LJ_COMB_GEOM
 217     SimdReal c6s_j_S, c12s_j_S;
 218 #        endif
 219
 220 #        if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
 221     /* Index for loading LJ parameters, complicated when interleaving */
 222     int aj2;
 223 #        endif
 224
 225     /* Intermediate variables for LJ calculation */
 226 #        ifndef LJ_COMB_LB
 227     SimdReal rinvsix_S0;
 228     SimdReal rinvsix_S1;
 229 #            ifndef HALF_LJ
 230     SimdReal rinvsix_S2;
 231     SimdReal rinvsix_S3;
 232 #            endif
 233 #        endif
 234 #        ifdef LJ_COMB_LB
 235     SimdReal sir_S0, sir2_S0, sir6_S0;
 236     SimdReal sir_S1, sir2_S1, sir6_S1;
 237 #            ifndef HALF_LJ
 238     SimdReal sir_S2, sir2_S2, sir6_S2;
 239     SimdReal sir_S3, sir2_S3, sir6_S3;
 240 #            endif
 241 #        endif
 242
 243     SimdReal FrLJ6_S0, FrLJ12_S0, frLJ_S0;
 244     SimdReal FrLJ6_S1, FrLJ12_S1, frLJ_S1;
 245 #        ifndef HALF_LJ
 246     SimdReal FrLJ6_S2, FrLJ12_S2, frLJ_S2;
 247     SimdReal FrLJ6_S3, FrLJ12_S3, frLJ_S3;
 248 #        endif
 249 #    endif /* CALC_LJ */
 250
 251     /* j-cluster index */
 252     cj = l_cj[cjind].cj;
 253
 254     /* Atom indices (of the first atom in the cluster) */
 255     aj = cj * UNROLLJ;
 256 #    if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
 257 #        if UNROLLJ == STRIDE
 258     aj2 = aj * 2;
 259 #        else
 260     aj2 = (cj >> 1) * 2 * STRIDE + (cj & 1) * UNROLLJ;
 261 #        endif
 262 #    endif
 263 #    if UNROLLJ == STRIDE
 264     ajx = aj * DIM;
 265 #    else
 266     ajx = (cj >> 1) * DIM * STRIDE + (cj & 1) * UNROLLJ;
 267 #    endif
 268     ajy = ajx + STRIDE;
 269     ajz = ajy + STRIDE;
 270
 271 #    ifdef CHECK_EXCLS
 272     gmx_load_simd_4xn_interactions(static_cast<int>(l_cj[cjind].excl),
 273                                    filter_S0,
 274                                    filter_S1,
 275                                    filter_S2,
 276                                    filter_S3,
 277                                    nbat->simdMasks.interaction_array.data(),
 278                                    &interact_S0,
 279                                    &interact_S1,
 280                                    &interact_S2,
 281                                    &interact_S3);
 282 #    endif /* CHECK_EXCLS */
 283
 284     /* load j atom coordinates */
 285     jx_S = load<SimdReal>(x + ajx);
 286     jy_S = load<SimdReal>(x + ajy);
 287     jz_S = load<SimdReal>(x + ajz);
 288
 289     /* Calculate distance */
 290     dx_S0 = ix_S0 - jx_S;
 291     dy_S0 = iy_S0 - jy_S;
 292     dz_S0 = iz_S0 - jz_S;
 293     dx_S1 = ix_S1 - jx_S;
 294     dy_S1 = iy_S1 - jy_S;
 295     dz_S1 = iz_S1 - jz_S;
 296     dx_S2 = ix_S2 - jx_S;
 297     dy_S2 = iy_S2 - jy_S;
 298     dz_S2 = iz_S2 - jz_S;
 299     dx_S3 = ix_S3 - jx_S;
 300     dy_S3 = iy_S3 - jy_S;
 301     dz_S3 = iz_S3 - jz_S;
 302
 303     /* rsq = dx*dx+dy*dy+dz*dz */
 304     rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
 305     rsq_S1 = norm2(dx_S1, dy_S1, dz_S1);
 306     rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
 307     rsq_S3 = norm2(dx_S3, dy_S3, dz_S3);
 308
 309     /* Do the cut-off check */
 310     wco_S0 = (rsq_S0 < rc2_S);
 311     wco_S1 = (rsq_S1 < rc2_S);
 312     wco_S2 = (rsq_S2 < rc2_S);
 313     wco_S3 = (rsq_S3 < rc2_S);
 314
 315 #    ifdef CHECK_EXCLS
 316 #        ifdef EXCL_FORCES
 317     /* Only remove the (sub-)diagonal to avoid double counting */
 318 #            if UNROLLJ == UNROLLI
 319     if (cj == ci_sh)
 320     {
 321         wco_S0 = wco_S0 && diagonal_mask_S0;
 322         wco_S1 = wco_S1 && diagonal_mask_S1;
 323         wco_S2 = wco_S2 && diagonal_mask_S2;
 324         wco_S3 = wco_S3 && diagonal_mask_S3;
 325     }
 326 #            else
 327 #                if UNROLLJ < UNROLLI
 328     if (cj == ci_sh * 2)
 329     {
 330         wco_S0 = wco_S0 && diagonal_mask0_S0;
 331         wco_S1 = wco_S1 && diagonal_mask0_S1;
 332         wco_S2 = wco_S2 && diagonal_mask0_S2;
 333         wco_S3 = wco_S3 && diagonal_mask0_S3;
 334     }
 335     if (cj == ci_sh * 2 + 1)
 336     {
 337         wco_S0 = wco_S0 && diagonal_mask1_S0;
 338         wco_S1 = wco_S1 && diagonal_mask1_S1;
 339         wco_S2 = wco_S2 && diagonal_mask1_S2;
 340         wco_S3 = wco_S3 && diagonal_mask1_S3;
 341     }
 342 #                else
 343     if (cj * 2 == ci_sh)
 344     {
 345         wco_S0 = wco_S0 && diagonal_mask0_S0;
 346         wco_S1 = wco_S1 && diagonal_mask0_S1;
 347         wco_S2 = wco_S2 && diagonal_mask0_S2;
 348         wco_S3 = wco_S3 && diagonal_mask0_S3;
 349     }
 350     else if (cj * 2 + 1 == ci_sh)
 351     {
 352         wco_S0 = wco_S0 && diagonal_mask1_S0;
 353         wco_S1 = wco_S1 && diagonal_mask1_S1;
 354         wco_S2 = wco_S2 && diagonal_mask1_S2;
 355         wco_S3 = wco_S3 && diagonal_mask1_S3;
 356     }
 357 #                endif
 358 #            endif
 359 #        else /* EXCL_FORCES */
 360     /* No exclusion forces: remove all excluded atom pairs from the list */
 361     wco_S0 = wco_S0 && interact_S0;
 362     wco_S1 = wco_S1 && interact_S1;
 363     wco_S2 = wco_S2 && interact_S2;
 364     wco_S3 = wco_S3 && interact_S3;
 365 #        endif
 366 #    endif
 367
 368 #    ifdef COUNT_PAIRS
 369     {
 370         int                              i, j;
 371         alignas(GMX_SIMD_ALIGNMENT) real tmp[2 * GMX_SIMD_REAL_WIDTH];
 372
 373         for (i = 0; i < UNROLLI; i++)
 374         {
 375             store(tmp, rc2_S - (i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
 376             for (j = 0; j < UNROLLJ; j++)
 377             {
 378                 if (tmp[j] >= 0)
 379                 {
 380                     npair++;
 381                 }
 382             }
 383         }
 384     }
 385 #    endif
 386
 387     // Ensure the distances do not fall below the limit where r^-12 overflows.
 388     // This should never happen for normal interactions.
 389     rsq_S0 = max(rsq_S0, minRsq_S);
 390     rsq_S1 = max(rsq_S1, minRsq_S);
 391     rsq_S2 = max(rsq_S2, minRsq_S);
 392     rsq_S3 = max(rsq_S3, minRsq_S);
 393
 394     /* Calculate 1/r */
 395 #    if !GMX_DOUBLE
 396     rinv_S0 = invsqrt(rsq_S0);
 397     rinv_S1 = invsqrt(rsq_S1);
 398     rinv_S2 = invsqrt(rsq_S2);
 399     rinv_S3 = invsqrt(rsq_S3);
 400 #    else
 401     invsqrtPair(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
 402     invsqrtPair(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
 403 #    endif
 404
 405 #    ifdef CALC_COULOMB
 406     /* Load parameters for j atom */
 407     jq_S  = load<SimdReal>(q + aj);
 408     qq_S0 = iq_S0 * jq_S;
 409     qq_S1 = iq_S1 * jq_S;
 410     qq_S2 = iq_S2 * jq_S;
 411     qq_S3 = iq_S3 * jq_S;
 412 #    endif
 413
 414 #    ifdef CALC_LJ
 415 #        if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
 416     SimdReal c6_S0, c6_S1, c12_S0, c12_S1;
 417     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp0, type + aj, &c6_S0, &c12_S0);
 418     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp1, type + aj, &c6_S1, &c12_S1);
 419 #            ifndef HALF_LJ
 420     SimdReal c6_S2, c6_S3, c12_S2, c12_S3;
 421     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp2, type + aj, &c6_S2, &c12_S2);
 422     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp3, type + aj, &c6_S3, &c12_S3);
 423 #            endif
 424 #        endif /* not defined any LJ rule */
 425
 426 #        ifdef LJ_COMB_GEOM
 427     c6s_j_S        = load<SimdReal>(ljc + aj2 + 0);
 428     c12s_j_S       = load<SimdReal>(ljc + aj2 + STRIDE);
 429     SimdReal c6_S0 = c6s_S0 * c6s_j_S;
 430     SimdReal c6_S1 = c6s_S1 * c6s_j_S;
 431 #            ifndef HALF_LJ
 432     SimdReal c6_S2 = c6s_S2 * c6s_j_S;
 433     SimdReal c6_S3 = c6s_S3 * c6s_j_S;
 434 #            endif
 435     SimdReal c12_S0 = c12s_S0 * c12s_j_S;
 436     SimdReal c12_S1 = c12s_S1 * c12s_j_S;
 437 #            ifndef HALF_LJ
 438     SimdReal c12_S2 = c12s_S2 * c12s_j_S;
 439     SimdReal c12_S3 = c12s_S3 * c12s_j_S;
 440 #            endif
 441 #        endif /* LJ_COMB_GEOM */
 442
 443 #        ifdef LJ_COMB_LB
 444     hsig_j_S = load<SimdReal>(ljc + aj2 + 0);
 445     seps_j_S = load<SimdReal>(ljc + aj2 + STRIDE);
 446
 447     sig_S0 = hsig_i_S0 + hsig_j_S;
 448     sig_S1 = hsig_i_S1 + hsig_j_S;
 449     eps_S0 = seps_i_S0 * seps_j_S;
 450     eps_S1 = seps_i_S1 * seps_j_S;
 451 #            ifndef HALF_LJ
 452     sig_S2 = hsig_i_S2 + hsig_j_S;
 453     sig_S3 = hsig_i_S3 + hsig_j_S;
 454     eps_S2 = seps_i_S2 * seps_j_S;
 455     eps_S3 = seps_i_S3 * seps_j_S;
 456 #            endif
 457 #        endif /* LJ_COMB_LB */
 458
 459 #    endif /* CALC_LJ */
 460
 461     /* Set rinv to zero for r beyond the cut-off */
 462     rinv_S0 = selectByMask(rinv_S0, wco_S0);
 463     rinv_S1 = selectByMask(rinv_S1, wco_S1);
 464     rinv_S2 = selectByMask(rinv_S2, wco_S2);
 465     rinv_S3 = selectByMask(rinv_S3, wco_S3);
 466
 467     rinvsq_S0 = rinv_S0 * rinv_S0;
 468     rinvsq_S1 = rinv_S1 * rinv_S1;
 469     rinvsq_S2 = rinv_S2 * rinv_S2;
 470     rinvsq_S3 = rinv_S3 * rinv_S3;
 471
 472 #    ifdef CALC_COULOMB
 473     /* Note that here we calculate force*r, not the usual force/r.
 474      * This allows avoiding masking the reaction-field contribution,
 475      * as frcoul is later multiplied by rinvsq which has been
 476      * masked with the cut-off check.
 477      */
 478
 479 #        ifdef EXCL_FORCES
 480     /* Only add 1/r for non-excluded atom pairs */
 481     rinv_ex_S0 = selectByMask(rinv_S0, interact_S0);
 482     rinv_ex_S1 = selectByMask(rinv_S1, interact_S1);
 483     rinv_ex_S2 = selectByMask(rinv_S2, interact_S2);
 484     rinv_ex_S3 = selectByMask(rinv_S3, interact_S3);
 485 #        else
 486     /* No exclusion forces, we always need 1/r */
 487 #            define rinv_ex_S0 rinv_S0
 488 #            define rinv_ex_S1 rinv_S1
 489 #            define rinv_ex_S2 rinv_S2
 490 #            define rinv_ex_S3 rinv_S3
 491 #        endif
 492
 493 #        ifdef CALC_COUL_RF
 494     /* Electrostatic interactions */
 495     frcoul_S0 = qq_S0 * fma(rsq_S0, mrc_3_S, rinv_ex_S0);
 496     frcoul_S1 = qq_S1 * fma(rsq_S1, mrc_3_S, rinv_ex_S1);
 497     frcoul_S2 = qq_S2 * fma(rsq_S2, mrc_3_S, rinv_ex_S2);
 498     frcoul_S3 = qq_S3 * fma(rsq_S3, mrc_3_S, rinv_ex_S3);
 499
 500 #            ifdef CALC_ENERGIES
 501     vcoul_S0 = qq_S0 * (rinv_ex_S0 + fma(rsq_S0, hrc_3_S, moh_rc_S));
 502     vcoul_S1 = qq_S1 * (rinv_ex_S1 + fma(rsq_S1, hrc_3_S, moh_rc_S));
 503     vcoul_S2 = qq_S2 * (rinv_ex_S2 + fma(rsq_S2, hrc_3_S, moh_rc_S));
 504     vcoul_S3 = qq_S3 * (rinv_ex_S3 + fma(rsq_S3, hrc_3_S, moh_rc_S));
 505 #            endif
 506 #        endif
 507
 508 #        ifdef CALC_COUL_EWALD
 509     /* We need to mask (or limit) rsq for the cut-off,
 510      * as large distances can cause an overflow in gmx_pmecorrF/V.
 511      */
 512     brsq_S0   = beta2_S * selectByMask(rsq_S0, wco_S0);
 513     brsq_S1   = beta2_S * selectByMask(rsq_S1, wco_S1);
 514     brsq_S2   = beta2_S * selectByMask(rsq_S2, wco_S2);
 515     brsq_S3   = beta2_S * selectByMask(rsq_S3, wco_S3);
 516     ewcorr_S0 = beta_S * pmeForceCorrection(brsq_S0);
 517     ewcorr_S1 = beta_S * pmeForceCorrection(brsq_S1);
 518     ewcorr_S2 = beta_S * pmeForceCorrection(brsq_S2);
 519     ewcorr_S3 = beta_S * pmeForceCorrection(brsq_S3);
 520     frcoul_S0 = qq_S0 * fma(ewcorr_S0, brsq_S0, rinv_ex_S0);
 521     frcoul_S1 = qq_S1 * fma(ewcorr_S1, brsq_S1, rinv_ex_S1);
 522     frcoul_S2 = qq_S2 * fma(ewcorr_S2, brsq_S2, rinv_ex_S2);
 523     frcoul_S3 = qq_S3 * fma(ewcorr_S3, brsq_S3, rinv_ex_S3);
 524
 525 #            ifdef CALC_ENERGIES
 526     vc_sub_S0 = beta_S * pmePotentialCorrection(brsq_S0);
 527     vc_sub_S1 = beta_S * pmePotentialCorrection(brsq_S1);
 528     vc_sub_S2 = beta_S * pmePotentialCorrection(brsq_S2);
 529     vc_sub_S3 = beta_S * pmePotentialCorrection(brsq_S3);
 530 #            endif
 531
 532 #        endif /* CALC_COUL_EWALD */
 533
 534 #        ifdef CALC_COUL_TAB
 535     /* Electrostatic interactions */
 536     r_S0 = rsq_S0 * rinv_S0;
 537     r_S1 = rsq_S1 * rinv_S1;
 538     r_S2 = rsq_S2 * rinv_S2;
 539     r_S3 = rsq_S3 * rinv_S3;
 540     /* Convert r to scaled table units */
 541     rs_S0 = r_S0 * invtsp_S;
 542     rs_S1 = r_S1 * invtsp_S;
 543     rs_S2 = r_S2 * invtsp_S;
 544     rs_S3 = r_S3 * invtsp_S;
 545     /* Truncate scaled r to an int */
 546     ti_S0 = cvttR2I(rs_S0);
 547     ti_S1 = cvttR2I(rs_S1);
 548     ti_S2 = cvttR2I(rs_S2);
 549     ti_S3 = cvttR2I(rs_S3);
 550
 551     rf_S0 = trunc(rs_S0);
 552     rf_S1 = trunc(rs_S1);
 553     rf_S2 = trunc(rs_S2);
 554     rf_S3 = trunc(rs_S3);
 555
 556     frac_S0 = rs_S0 - rf_S0;
 557     frac_S1 = rs_S1 - rf_S1;
 558     frac_S2 = rs_S2 - rf_S2;
 559     frac_S3 = rs_S3 - rf_S3;
 560
 561     /* Load and interpolate table forces and possibly energies.
 562      * Force and energy can be combined in one table, stride 4: FDV0
 563      * or in two separate tables with stride 1: F and V
 564      * Currently single precision uses FDV0, double F and V.
 565      */
 566 #            ifndef CALC_ENERGIES
 567 #                ifdef TAB_FDV0
 568     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 569     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
 570     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 571     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
 572 #                else
 573     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 574     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
 575     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 576     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
 577     ctab1_S0  = ctab1_S0 - ctab0_S0;
 578     ctab1_S1  = ctab1_S1 - ctab0_S1;
 579     ctab1_S2  = ctab1_S2 - ctab0_S2;
 580     ctab1_S3  = ctab1_S3 - ctab0_S3;
 581 #                endif
 582 #            else
 583 #                ifdef TAB_FDV0
 584     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0, &ctabv_S0, &dum_S0);
 585     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1, &ctabv_S1, &dum_S1);
 586     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2, &ctabv_S2, &dum_S2);
 587     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3, &ctabv_S3, &dum_S3);
 588 #                else
 589     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 590     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S0, &ctabv_S0, &dum_S0);
 591     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
 592     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S1, &ctabv_S1, &dum_S1);
 593     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 594     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S2, &ctabv_S2, &dum_S2);
 595     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
 596     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S3, &ctabv_S3, &dum_S3);
 597     ctab1_S0 = ctab1_S0 - ctab0_S0;
 598     ctab1_S1 = ctab1_S1 - ctab0_S1;
 599     ctab1_S2 = ctab1_S2 - ctab0_S2;
 600     ctab1_S3 = ctab1_S3 - ctab0_S3;
 601 #                endif
 602 #            endif
 603     fsub_S0   = fma(frac_S0, ctab1_S0, ctab0_S0);
 604     fsub_S1   = fma(frac_S1, ctab1_S1, ctab0_S1);
 605     fsub_S2   = fma(frac_S2, ctab1_S2, ctab0_S2);
 606     fsub_S3   = fma(frac_S3, ctab1_S3, ctab0_S3);
 607     frcoul_S0 = qq_S0 * fnma(fsub_S0, r_S0, rinv_ex_S0);
 608     frcoul_S1 = qq_S1 * fnma(fsub_S1, r_S1, rinv_ex_S1);
 609     frcoul_S2 = qq_S2 * fnma(fsub_S2, r_S2, rinv_ex_S2);
 610     frcoul_S3 = qq_S3 * fnma(fsub_S3, r_S3, rinv_ex_S3);
 611
 612 #            ifdef CALC_ENERGIES
 613     vc_sub_S0 = fma((mhalfsp_S * frac_S0), (ctab0_S0 + fsub_S0), ctabv_S0);
 614     vc_sub_S1 = fma((mhalfsp_S * frac_S1), (ctab0_S1 + fsub_S1), ctabv_S1);
 615     vc_sub_S2 = fma((mhalfsp_S * frac_S2), (ctab0_S2 + fsub_S2), ctabv_S2);
 616     vc_sub_S3 = fma((mhalfsp_S * frac_S3), (ctab0_S3 + fsub_S3), ctabv_S3);
 617 #            endif
 618 #        endif /* CALC_COUL_TAB */
 619
 620 #        if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 621 #            ifndef NO_SHIFT_EWALD
 622     /* Add Ewald potential shift to vc_sub for convenience */
 623 #                ifdef CHECK_EXCLS
 624     vc_sub_S0 = vc_sub_S0 + selectByMask(sh_ewald_S, interact_S0);
 625     vc_sub_S1 = vc_sub_S1 + selectByMask(sh_ewald_S, interact_S1);
 626     vc_sub_S2 = vc_sub_S2 + selectByMask(sh_ewald_S, interact_S2);
 627     vc_sub_S3 = vc_sub_S3 + selectByMask(sh_ewald_S, interact_S3);
 628 #                else
 629     vc_sub_S0 = vc_sub_S0 + sh_ewald_S;
 630     vc_sub_S1 = vc_sub_S1 + sh_ewald_S;
 631     vc_sub_S2 = vc_sub_S2 + sh_ewald_S;
 632     vc_sub_S3 = vc_sub_S3 + sh_ewald_S;
 633 #                endif
 634 #            endif
 635
 636     vcoul_S0 = qq_S0 * (rinv_ex_S0 - vc_sub_S0);
 637     vcoul_S1 = qq_S1 * (rinv_ex_S1 - vc_sub_S1);
 638     vcoul_S2 = qq_S2 * (rinv_ex_S2 - vc_sub_S2);
 639     vcoul_S3 = qq_S3 * (rinv_ex_S3 - vc_sub_S3);
 640
 641 #        endif
 642
 643 #        ifdef CALC_ENERGIES
 644     /* Mask energy for cut-off and diagonal */
 645     vcoul_S0 = selectByMask(vcoul_S0, wco_S0);
 646     vcoul_S1 = selectByMask(vcoul_S1, wco_S1);
 647     vcoul_S2 = selectByMask(vcoul_S2, wco_S2);
 648     vcoul_S3 = selectByMask(vcoul_S3, wco_S3);
 649 #        endif
 650
 651 #    endif /* CALC_COULOMB */
 652
 653 #    ifdef CALC_LJ
 654     /* Lennard-Jones interaction */
 655
 656 #        ifdef VDW_CUTOFF_CHECK
 657     wco_vdw_S0 = (rsq_S0 < rcvdw2_S);
 658     wco_vdw_S1 = (rsq_S1 < rcvdw2_S);
 659 #            ifndef HALF_LJ
 660     wco_vdw_S2 = (rsq_S2 < rcvdw2_S);
 661     wco_vdw_S3 = (rsq_S3 < rcvdw2_S);
 662 #            endif
 663 #        else
 664     /* Same cut-off for Coulomb and VdW, reuse the registers */
 665 #            define wco_vdw_S0 wco_S0
 666 #            define wco_vdw_S1 wco_S1
 667 #            define wco_vdw_S2 wco_S2
 668 #            define wco_vdw_S3 wco_S3
 669 #        endif
 670
 671 #        ifndef LJ_COMB_LB
 672     rinvsix_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
 673     rinvsix_S1 = rinvsq_S1 * rinvsq_S1 * rinvsq_S1;
 674 #            ifdef EXCL_FORCES
 675     rinvsix_S0 = selectByMask(rinvsix_S0, interact_S0);
 676     rinvsix_S1 = selectByMask(rinvsix_S1, interact_S1);
 677 #            endif
 678 #            ifndef HALF_LJ
 679     rinvsix_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
 680     rinvsix_S3 = rinvsq_S3 * rinvsq_S3 * rinvsq_S3;
 681 #                ifdef EXCL_FORCES
 682     rinvsix_S2 = selectByMask(rinvsix_S2, interact_S2);
 683     rinvsix_S3 = selectByMask(rinvsix_S3, interact_S3);
 684 #                endif
 685 #            endif
 686
 687 #            if defined LJ_CUT || defined LJ_POT_SWITCH
 688     /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
 689     FrLJ6_S0 = c6_S0 * rinvsix_S0;
 690     FrLJ6_S1 = c6_S1 * rinvsix_S1;
 691 #                ifndef HALF_LJ
 692     FrLJ6_S2 = c6_S2 * rinvsix_S2;
 693     FrLJ6_S3 = c6_S3 * rinvsix_S3;
 694 #                endif
 695     FrLJ12_S0 = c12_S0 * rinvsix_S0 * rinvsix_S0;
 696     FrLJ12_S1 = c12_S1 * rinvsix_S1 * rinvsix_S1;
 697 #                ifndef HALF_LJ
 698     FrLJ12_S2 = c12_S2 * rinvsix_S2 * rinvsix_S2;
 699     FrLJ12_S3 = c12_S3 * rinvsix_S3 * rinvsix_S3;
 700 #                endif
 701 #            endif
 702
 703 #            if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 704     /* We switch the LJ force */
 705     r_S0    = rsq_S0 * rinv_S0;
 706     rsw_S0  = max(r_S0 - rswitch_S, zero_S);
 707     rsw2_S0 = rsw_S0 * rsw_S0;
 708     r_S1    = rsq_S1 * rinv_S1;
 709     rsw_S1  = max(r_S1 - rswitch_S, zero_S);
 710     rsw2_S1 = rsw_S1 * rsw_S1;
 711 #                ifndef HALF_LJ
 712     r_S2    = rsq_S2 * rinv_S2;
 713     rsw_S2  = max(r_S2 - rswitch_S, zero_S);
 714     rsw2_S2 = rsw_S2 * rsw_S2;
 715     r_S3    = rsq_S3 * rinv_S3;
 716     rsw_S3  = max(r_S3 - rswitch_S, zero_S);
 717     rsw2_S3 = rsw_S3 * rsw_S3;
 718 #                endif
 719 #            endif
 720
 721 #            ifdef LJ_FORCE_SWITCH
 722
 723 #                define gmx_add_fr_switch(fr, rsw, rsw2_r, c2, c3) \
 724                     fma(fma(c3, rsw, c2), rsw2_r, (fr))
 725     SimdReal rsw2_r_S0 = rsw2_S0 * r_S0;
 726     SimdReal rsw2_r_S1 = rsw2_S1 * r_S1;
 727     FrLJ6_S0 = c6_S0 * gmx_add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S);
 728     FrLJ6_S1 = c6_S1 * gmx_add_fr_switch(rinvsix_S1, rsw_S1, rsw2_r_S1, p6_fc2_S, p6_fc3_S);
 729 #                ifndef HALF_LJ
 730     SimdReal rsw2_r_S2 = rsw2_S2 * r_S2;
 731     SimdReal rsw2_r_S3 = rsw2_S3 * r_S3;
 732     FrLJ6_S2 = c6_S2 * gmx_add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S);
 733     FrLJ6_S3 = c6_S3 * gmx_add_fr_switch(rinvsix_S3, rsw_S3, rsw2_r_S3, p6_fc2_S, p6_fc3_S);
 734 #                endif
 735     FrLJ12_S0 = c12_S0 * gmx_add_fr_switch((rinvsix_S0 * rinvsix_S0), rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S);
 736     FrLJ12_S1 = c12_S1 * gmx_add_fr_switch((rinvsix_S1 * rinvsix_S1), rsw_S1, rsw2_r_S1, p12_fc2_S, p12_fc3_S);
 737 #                ifndef HALF_LJ
 738     FrLJ12_S2 = c12_S2 * gmx_add_fr_switch((rinvsix_S2 * rinvsix_S2), rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S);
 739     FrLJ12_S3 = c12_S3 * gmx_add_fr_switch((rinvsix_S3 * rinvsix_S3), rsw_S3, rsw2_r_S3, p12_fc2_S, p12_fc3_S);
 740 #                endif
 741 #                undef gmx_add_fr_switch
 742 #            endif /* LJ_FORCE_SWITCH */
 743
 744 #        endif /* not LJ_COMB_LB */
 745
 746 #        ifdef LJ_COMB_LB
 747     sir_S0 = sig_S0 * rinv_S0;
 748     sir_S1 = sig_S1 * rinv_S1;
 749 #            ifndef HALF_LJ
 750     sir_S2 = sig_S2 * rinv_S2;
 751     sir_S3 = sig_S3 * rinv_S3;
 752 #            endif
 753     sir2_S0 = sir_S0 * sir_S0;
 754     sir2_S1 = sir_S1 * sir_S1;
 755 #            ifndef HALF_LJ
 756     sir2_S2 = sir_S2 * sir_S2;
 757     sir2_S3 = sir_S3 * sir_S3;
 758 #            endif
 759     sir6_S0 = sir2_S0 * sir2_S0 * sir2_S0;
 760     sir6_S1 = sir2_S1 * sir2_S1 * sir2_S1;
 761 #            ifdef EXCL_FORCES
 762     sir6_S0 = selectByMask(sir6_S0, interact_S0);
 763     sir6_S1 = selectByMask(sir6_S1, interact_S1);
 764 #            endif
 765 #            ifndef HALF_LJ
 766     sir6_S2 = sir2_S2 * sir2_S2 * sir2_S2;
 767     sir6_S3 = sir2_S3 * sir2_S3 * sir2_S3;
 768 #                ifdef EXCL_FORCES
 769     sir6_S2 = selectByMask(sir6_S2, interact_S2);
 770     sir6_S3 = selectByMask(sir6_S3, interact_S3);
 771 #                endif
 772 #            endif
 773 #            ifdef VDW_CUTOFF_CHECK
 774     sir6_S0 = selectByMask(sir6_S0, wco_vdw_S0);
 775     sir6_S1 = selectByMask(sir6_S1, wco_vdw_S1);
 776 #                ifndef HALF_LJ
 777     sir6_S2 = selectByMask(sir6_S2, wco_vdw_S2);
 778     sir6_S3 = selectByMask(sir6_S3, wco_vdw_S3);
 779 #                endif
 780 #            endif
 781     FrLJ6_S0 = eps_S0 * sir6_S0;
 782     FrLJ6_S1 = eps_S1 * sir6_S1;
 783 #            ifndef HALF_LJ
 784     FrLJ6_S2 = eps_S2 * sir6_S2;
 785     FrLJ6_S3 = eps_S3 * sir6_S3;
 786 #            endif
 787     FrLJ12_S0 = FrLJ6_S0 * sir6_S0;
 788     FrLJ12_S1 = FrLJ6_S1 * sir6_S1;
 789 #            ifndef HALF_LJ
 790     FrLJ12_S2 = FrLJ6_S2 * sir6_S2;
 791     FrLJ12_S3 = FrLJ6_S3 * sir6_S3;
 792 #            endif
 793 #            if defined CALC_ENERGIES
 794     /* We need C6 and C12 to calculate the LJ potential shift */
 795     sig2_S0 = sig_S0 * sig_S0;
 796     sig2_S1 = sig_S1 * sig_S1;
 797 #                ifndef HALF_LJ
 798     sig2_S2 = sig_S2 * sig_S2;
 799     sig2_S3 = sig_S3 * sig_S3;
 800 #                endif
 801     sig6_S0 = sig2_S0 * sig2_S0 * sig2_S0;
 802     sig6_S1 = sig2_S1 * sig2_S1 * sig2_S1;
 803 #                ifndef HALF_LJ
 804     sig6_S2 = sig2_S2 * sig2_S2 * sig2_S2;
 805     sig6_S3 = sig2_S3 * sig2_S3 * sig2_S3;
 806 #                endif
 807     SimdReal c6_S0 = eps_S0 * sig6_S0;
 808     SimdReal c6_S1 = eps_S1 * sig6_S1;
 809 #                ifndef HALF_LJ
 810     SimdReal c6_S2 = eps_S2 * sig6_S2;
 811     SimdReal c6_S3 = eps_S3 * sig6_S3;
 812 #                endif
 813     SimdReal c12_S0 = c6_S0 * sig6_S0;
 814     SimdReal c12_S1 = c6_S1 * sig6_S1;
 815 #                ifndef HALF_LJ
 816     SimdReal c12_S2 = c6_S2 * sig6_S2;
 817     SimdReal c12_S3 = c6_S3 * sig6_S3;
 818 #                endif
 819 #            endif
 820 #        endif /* LJ_COMB_LB */
 821
 822     /* Determine the total scalar LJ force*r */
 823     frLJ_S0 = FrLJ12_S0 - FrLJ6_S0;
 824     frLJ_S1 = FrLJ12_S1 - FrLJ6_S1;
 825 #        ifndef HALF_LJ
 826     frLJ_S2 = FrLJ12_S2 - FrLJ6_S2;
 827     frLJ_S3 = FrLJ12_S3 - FrLJ6_S3;
 828 #        endif
 829
 830 #        if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
 831
 832 #            ifdef LJ_CUT
 833     /* Calculate the LJ energies, with constant potential shift */
 834     SimdReal VLJ6_S0 = sixth_S * fma(c6_S0, p6_cpot_S, FrLJ6_S0);
 835     SimdReal VLJ6_S1 = sixth_S * fma(c6_S1, p6_cpot_S, FrLJ6_S1);
 836 #                ifndef HALF_LJ
 837     SimdReal VLJ6_S2 = sixth_S * fma(c6_S2, p6_cpot_S, FrLJ6_S2);
 838     SimdReal VLJ6_S3 = sixth_S * fma(c6_S3, p6_cpot_S, FrLJ6_S3);
 839 #                endif
 840     SimdReal VLJ12_S0 = twelveth_S * fma(c12_S0, p12_cpot_S, FrLJ12_S0);
 841     SimdReal VLJ12_S1 = twelveth_S * fma(c12_S1, p12_cpot_S, FrLJ12_S1);
 842 #                ifndef HALF_LJ
 843     SimdReal VLJ12_S2 = twelveth_S * fma(c12_S2, p12_cpot_S, FrLJ12_S2);
 844     SimdReal VLJ12_S3 = twelveth_S * fma(c12_S3, p12_cpot_S, FrLJ12_S3);
 845 #                endif
 846 #            endif /* LJ_CUT */
 847
 848 #            ifdef LJ_FORCE_SWITCH
 849 #                define v_fswitch_r(rsw, rsw2, c0, c3, c4) \
 850                     fma(fma((c4), (rsw), (c3)), ((rsw2) * (rsw)), (c0))
 851
 852     SimdReal VLJ6_S0 =
 853             c6_S0 * fma(sixth_S, rinvsix_S0, v_fswitch_r(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 854     SimdReal VLJ6_S1 =
 855             c6_S1 * fma(sixth_S, rinvsix_S1, v_fswitch_r(rsw_S1, rsw2_S1, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 856 #                ifndef HALF_LJ
 857     SimdReal VLJ6_S2 =
 858             c6_S2 * fma(sixth_S, rinvsix_S2, v_fswitch_r(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 859     SimdReal VLJ6_S3 =
 860             c6_S3 * fma(sixth_S, rinvsix_S3, v_fswitch_r(rsw_S3, rsw2_S3, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 861 #                endif
 862     SimdReal VLJ12_S0 = c12_S0
 863                         * fma(twelveth_S,
 864                               rinvsix_S0 * rinvsix_S0,
 865                               v_fswitch_r(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 866     SimdReal VLJ12_S1 = c12_S1
 867                         * fma(twelveth_S,
 868                               rinvsix_S1 * rinvsix_S1,
 869                               v_fswitch_r(rsw_S1, rsw2_S1, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 870 #                ifndef HALF_LJ
 871     SimdReal VLJ12_S2 = c12_S2
 872                         * fma(twelveth_S,
 873                               rinvsix_S2 * rinvsix_S2,
 874                               v_fswitch_r(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 875     SimdReal VLJ12_S3 = c12_S3
 876                         * fma(twelveth_S,
 877                               rinvsix_S3 * rinvsix_S3,
 878                               v_fswitch_r(rsw_S3, rsw2_S3, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 879 #                endif
 880 #                undef v_fswitch_r
 881 #            endif /* LJ_FORCE_SWITCH */
 882
 883     /* Add up the repulsion and dispersion */
 884     SimdReal VLJ_S0 = VLJ12_S0 - VLJ6_S0;
 885     SimdReal VLJ_S1 = VLJ12_S1 - VLJ6_S1;
 886 #            ifndef HALF_LJ
 887     SimdReal VLJ_S2 = VLJ12_S2 - VLJ6_S2;
 888     SimdReal VLJ_S3 = VLJ12_S3 - VLJ6_S3;
 889 #            endif
 890
 891 #        endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
 892
 893 #        ifdef LJ_POT_SWITCH
 894     /* We always need the potential, since it is needed for the force */
 895     SimdReal VLJ_S0 = fnma(sixth_S, FrLJ6_S0, twelveth_S * FrLJ12_S0);
 896     SimdReal VLJ_S1 = fnma(sixth_S, FrLJ6_S1, twelveth_S * FrLJ12_S1);
 897 #            ifndef HALF_LJ
 898     SimdReal VLJ_S2 = fnma(sixth_S, FrLJ6_S2, twelveth_S * FrLJ12_S2);
 899     SimdReal VLJ_S3 = fnma(sixth_S, FrLJ6_S3, twelveth_S * FrLJ12_S3);
 900 #            endif
 901
 902     {
 903         SimdReal sw_S0, dsw_S0;
 904         SimdReal sw_S1, dsw_S1;
 905 #            ifndef HALF_LJ
 906         SimdReal sw_S2, dsw_S2;
 907         SimdReal sw_S3, dsw_S3;
 908 #            endif
 909
 910 #            define switch_r(rsw, rsw2, c3, c4, c5) \
 911                 fma(fma(fma(c5, rsw, c4), rsw, c3), ((rsw2) * (rsw)), one_S)
 912 #            define dswitch_r(rsw, rsw2, c2, c3, c4) (fma(fma(c4, rsw, c3), rsw, c2) * (rsw2))
 913
 914         sw_S0  = switch_r(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
 915         dsw_S0 = dswitch_r(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
 916         sw_S1  = switch_r(rsw_S1, rsw2_S1, swV3_S, swV4_S, swV5_S);
 917         dsw_S1 = dswitch_r(rsw_S1, rsw2_S1, swF2_S, swF3_S, swF4_S);
 918 #            ifndef HALF_LJ
 919         sw_S2  = switch_r(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
 920         dsw_S2 = dswitch_r(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
 921         sw_S3  = switch_r(rsw_S3, rsw2_S3, swV3_S, swV4_S, swV5_S);
 922         dsw_S3 = dswitch_r(rsw_S3, rsw2_S3, swF2_S, swF3_S, swF4_S);
 923 #            endif
 924         frLJ_S0 = fnma(dsw_S0 * VLJ_S0, r_S0, sw_S0 * frLJ_S0);
 925         frLJ_S1 = fnma(dsw_S1 * VLJ_S1, r_S1, sw_S1 * frLJ_S1);
 926 #            ifndef HALF_LJ
 927         frLJ_S2 = fnma(dsw_S2 * VLJ_S2, r_S2, sw_S2 * frLJ_S2);
 928         frLJ_S3 = fnma(dsw_S3 * VLJ_S3, r_S3, sw_S3 * frLJ_S3);
 929 #            endif
 930 #            ifdef CALC_ENERGIES
 931         VLJ_S0 = sw_S0 * VLJ_S0;
 932         VLJ_S1 = sw_S1 * VLJ_S1;
 933 #                ifndef HALF_LJ
 934         VLJ_S2 = sw_S2 * VLJ_S2;
 935         VLJ_S3 = sw_S3 * VLJ_S3;
 936 #                endif
 937 #            endif
 938
 939 #            undef switch_r
 940 #            undef dswitch_r
 941     }
 942 #        endif /* LJ_POT_SWITCH */
 943
 944 #        if defined CALC_ENERGIES && defined CHECK_EXCLS
 945     /* The potential shift should be removed for excluded pairs */
 946     VLJ_S0 = selectByMask(VLJ_S0, interact_S0);
 947     VLJ_S1 = selectByMask(VLJ_S1, interact_S1);
 948 #            ifndef HALF_LJ
 949     VLJ_S2 = selectByMask(VLJ_S2, interact_S2);
 950     VLJ_S3 = selectByMask(VLJ_S3, interact_S3);
 951 #            endif
 952 #        endif
 953
 954 #        ifdef LJ_EWALD_GEOM
 955     {
 956         SimdReal c6s_j_S;
 957         SimdReal c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0;
 958         SimdReal c6grid_S1, rinvsix_nm_S1, cr2_S1, expmcr2_S1, poly_S1;
 959 #            ifndef HALF_LJ
 960         SimdReal c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2;
 961         SimdReal c6grid_S3, rinvsix_nm_S3, cr2_S3, expmcr2_S3, poly_S3;
 962 #            endif
 963 #            ifdef CALC_ENERGIES
 964         SimdReal sh_mask_S0;
 965         SimdReal sh_mask_S1;
 966 #                ifndef HALF_LJ
 967         SimdReal sh_mask_S2;
 968         SimdReal sh_mask_S3;
 969 #                endif
 970 #            endif
 971
 972         /* Determine C6 for the grid using the geometric combination rule */
 973         c6s_j_S   = load<SimdReal>(ljc + aj2 + 0);
 974         c6grid_S0 = c6s_S0 * c6s_j_S;
 975         c6grid_S1 = c6s_S1 * c6s_j_S;
 976 #            ifndef HALF_LJ
 977         c6grid_S2 = c6s_S2 * c6s_j_S;
 978         c6grid_S3 = c6s_S3 * c6s_j_S;
 979 #            endif
 980
 981 #            ifdef CHECK_EXCLS
 982         /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
 983         rinvsix_nm_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
 984         rinvsix_nm_S1 = rinvsq_S1 * rinvsq_S1 * rinvsq_S1;
 985 #                ifndef HALF_LJ
 986         rinvsix_nm_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
 987         rinvsix_nm_S3 = rinvsq_S3 * rinvsq_S3 * rinvsq_S3;
 988 #                endif
 989 #            else
 990         /* We didn't use a mask, so we can copy */
 991         rinvsix_nm_S0 = rinvsix_S0;
 992         rinvsix_nm_S1 = rinvsix_S1;
 993 #                ifndef HALF_LJ
 994         rinvsix_nm_S2 = rinvsix_S2;
 995         rinvsix_nm_S3 = rinvsix_S3;
 996 #                endif
 997 #            endif
 998
 999         /* Mask for the cut-off to avoid overflow of cr2^2 */
1000         cr2_S0 = lje_c2_S * selectByMask(rsq_S0, wco_vdw_S0);
1001         cr2_S1 = lje_c2_S * selectByMask(rsq_S1, wco_vdw_S1);
1002 #            ifndef HALF_LJ
1003         cr2_S2 = lje_c2_S * selectByMask(rsq_S2, wco_vdw_S2);
1004         cr2_S3 = lje_c2_S * selectByMask(rsq_S3, wco_vdw_S3);
1005 #            endif
1006         // Unsafe version of our exp() should be fine, since these arguments should never
1007         // be smaller than -127 for any reasonable choice of cutoff or ewald coefficients.
1008         expmcr2_S0 = exp<MathOptimization::Unsafe>(-cr2_S0);
1009         expmcr2_S1 = exp<MathOptimization::Unsafe>(-cr2_S1);
1010 #            ifndef HALF_LJ
1011         expmcr2_S2 = exp<MathOptimization::Unsafe>(-cr2_S2);
1012         expmcr2_S3 = exp<MathOptimization::Unsafe>(-cr2_S3);
1013 #            endif
1014
1015         /* 1 + cr2 + 1/2*cr2^2 */
1016         poly_S0 = fma(fma(half_S, cr2_S0, one_S), cr2_S0, one_S);
1017         poly_S1 = fma(fma(half_S, cr2_S1, one_S), cr2_S1, one_S);
1018 #            ifndef HALF_LJ
1019         poly_S2 = fma(fma(half_S, cr2_S2, one_S), cr2_S2, one_S);
1020         poly_S3 = fma(fma(half_S, cr2_S3, one_S), cr2_S3, one_S);
1021 #            endif
1022
1023         /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
1024          * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
1025          */
1026         frLJ_S0 = fma(c6grid_S0,
1027                       fnma(expmcr2_S0, fma(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0),
1028                       frLJ_S0);
1029         frLJ_S1 = fma(c6grid_S1,
1030                       fnma(expmcr2_S1, fma(rinvsix_nm_S1, poly_S1, lje_c6_6_S), rinvsix_nm_S1),
1031                       frLJ_S1);
1032 #            ifndef HALF_LJ
1033         frLJ_S2 = fma(c6grid_S2,
1034                       fnma(expmcr2_S2, fma(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2),
1035                       frLJ_S2);
1036         frLJ_S3 = fma(c6grid_S3,
1037                       fnma(expmcr2_S3, fma(rinvsix_nm_S3, poly_S3, lje_c6_6_S), rinvsix_nm_S3),
1038                       frLJ_S3);
1039 #            endif
1040
1041 #            ifdef CALC_ENERGIES
1042 #                ifdef CHECK_EXCLS
1043         sh_mask_S0 = selectByMask(lje_vc_S, interact_S0);
1044         sh_mask_S1 = selectByMask(lje_vc_S, interact_S1);
1045 #                    ifndef HALF_LJ
1046         sh_mask_S2 = selectByMask(lje_vc_S, interact_S2);
1047         sh_mask_S3 = selectByMask(lje_vc_S, interact_S3);
1048 #                    endif
1049 #                else
1050         sh_mask_S0 = lje_vc_S;
1051         sh_mask_S1 = lje_vc_S;
1052 #                    ifndef HALF_LJ
1053         sh_mask_S2 = lje_vc_S;
1054         sh_mask_S3 = lje_vc_S;
1055 #                    endif
1056 #                endif
1057
1058         VLJ_S0 = fma(sixth_S * c6grid_S0,
1059                      fma(rinvsix_nm_S0, fnma(expmcr2_S0, poly_S0, one_S), sh_mask_S0),
1060                      VLJ_S0);
1061         VLJ_S1 = fma(sixth_S * c6grid_S1,
1062                      fma(rinvsix_nm_S1, fnma(expmcr2_S1, poly_S1, one_S), sh_mask_S1),
1063                      VLJ_S1);
1064 #                ifndef HALF_LJ
1065         VLJ_S2 = fma(sixth_S * c6grid_S2,
1066                      fma(rinvsix_nm_S2, fnma(expmcr2_S2, poly_S2, one_S), sh_mask_S2),
1067                      VLJ_S2);
1068         VLJ_S3 = fma(sixth_S * c6grid_S3,
1069                      fma(rinvsix_nm_S3, fnma(expmcr2_S3, poly_S3, one_S), sh_mask_S3),
1070                      VLJ_S3);
1071 #                endif
1072 #            endif /* CALC_ENERGIES */
1073     }
1074 #        endif /* LJ_EWALD_GEOM */
1075
1076 #        if defined VDW_CUTOFF_CHECK
1077     /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
1078      * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
1079      */
1080     frLJ_S0 = selectByMask(frLJ_S0, wco_vdw_S0);
1081     frLJ_S1 = selectByMask(frLJ_S1, wco_vdw_S1);
1082 #            ifndef HALF_LJ
1083     frLJ_S2 = selectByMask(frLJ_S2, wco_vdw_S2);
1084     frLJ_S3 = selectByMask(frLJ_S3, wco_vdw_S3);
1085 #            endif
1086 #        endif
1087
1088 #        ifdef CALC_ENERGIES
1089     /* The potential shift should be removed for pairs beyond cut-off */
1090     VLJ_S0 = selectByMask(VLJ_S0, wco_vdw_S0);
1091     VLJ_S1 = selectByMask(VLJ_S1, wco_vdw_S1);
1092 #            ifndef HALF_LJ
1093     VLJ_S2 = selectByMask(VLJ_S2, wco_vdw_S2);
1094     VLJ_S3 = selectByMask(VLJ_S3, wco_vdw_S3);
1095 #            endif
1096 #        endif
1097
1098 #    endif /* CALC_LJ */
1099
1100 #    ifdef CALC_ENERGIES
1101 #        ifdef ENERGY_GROUPS
1102     /* Extract the group pair index per j pair.
1103      * Energy groups are stored per i-cluster, so things get
1104      * complicated when the i- and j-cluster size don't match.
1105      */
1106     {
1107         int egps_j;
1108 #            if UNROLLJ == 2
1109         egps_j    = nbatParams.energrp[cj >> 1];
1110         egp_jj[0] = ((egps_j >> ((cj & 1) * egps_jshift)) & egps_jmask) * egps_jstride;
1111 #            else
1112         /* We assume UNROLLI <= UNROLLJ */
1113         int jdi;
1114         for (jdi = 0; jdi < UNROLLJ / UNROLLI; jdi++)
1115         {
1116             int jj;
1117             egps_j = nbatParams.energrp[cj * (UNROLLJ / UNROLLI) + jdi];
1118             for (jj = 0; jj < (UNROLLI / 2); jj++)
1119             {
1120                 egp_jj[jdi * (UNROLLI / 2) + jj] =
1121                         ((egps_j >> (jj * egps_jshift)) & egps_jmask) * egps_jstride;
1122             }
1123         }
1124 #            endif
1125     }
1126 #        endif
1127
1128 #        ifdef CALC_COULOMB
1129 #            ifndef ENERGY_GROUPS
1130     vctot_S = vctot_S + vcoul_S0 + vcoul_S1 + vcoul_S2 + vcoul_S3;
1131 #            else
1132     add_ener_grp(vcoul_S0, vctp[0], egp_jj);
1133     add_ener_grp(vcoul_S1, vctp[1], egp_jj);
1134     add_ener_grp(vcoul_S2, vctp[2], egp_jj);
1135     add_ener_grp(vcoul_S3, vctp[3], egp_jj);
1136 #            endif
1137 #        endif
1138
1139 #        ifdef CALC_LJ
1140
1141 #            ifndef ENERGY_GROUPS
1142 #                ifndef HALF_LJ
1143     Vvdwtot_S = Vvdwtot_S + VLJ_S0 + VLJ_S1 + VLJ_S2 + VLJ_S3;
1144 #                else
1145     Vvdwtot_S = Vvdwtot_S + VLJ_S0 + VLJ_S1;
1146 #                endif
1147 #            else
1148     add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
1149     add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
1150 #                ifndef HALF_LJ
1151     add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj);
1152     add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj);
1153 #                endif
1154 #            endif
1155 #        endif /* CALC_LJ */
1156 #    endif     /* CALC_ENERGIES */
1157
1158 #    ifdef CALC_LJ
1159 #        ifdef CALC_COULOMB
1160     fscal_S0 = rinvsq_S0 * (frcoul_S0 + frLJ_S0);
1161 #        else
1162     fscal_S0 = rinvsq_S0 * frLJ_S0;
1163 #        endif
1164 #        ifdef CALC_COULOMB
1165     fscal_S1 = rinvsq_S1 * (frcoul_S1 + frLJ_S1);
1166 #        else
1167     fscal_S1 = rinvsq_S1 * frLJ_S1;
1168 #        endif
1169 #    else
1170     fscal_S0 = rinvsq_S0 * frcoul_S0;
1171     fscal_S1 = rinvsq_S1 * frcoul_S1;
1172 #    endif /* CALC_LJ */
1173 #    if defined CALC_LJ && !defined HALF_LJ
1174 #        ifdef CALC_COULOMB
1175     fscal_S2 = rinvsq_S2 * (frcoul_S2 + frLJ_S2);
1176     fscal_S3 = rinvsq_S3 * (frcoul_S3 + frLJ_S3);
1177 #        else
1178     fscal_S2 = rinvsq_S2 * frLJ_S2;
1179     fscal_S3 = rinvsq_S3 * frLJ_S3;
1180 #        endif
1181 #    else
1182     /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
1183     fscal_S2 = rinvsq_S2 * frcoul_S2;
1184     fscal_S3 = rinvsq_S3 * frcoul_S3;
1185 #    endif
1186
1187     /* Calculate temporary vectorial force */
1188     tx_S0 = fscal_S0 * dx_S0;
1189     tx_S1 = fscal_S1 * dx_S1;
1190     tx_S2 = fscal_S2 * dx_S2;
1191     tx_S3 = fscal_S3 * dx_S3;
1192     ty_S0 = fscal_S0 * dy_S0;
1193     ty_S1 = fscal_S1 * dy_S1;
1194     ty_S2 = fscal_S2 * dy_S2;
1195     ty_S3 = fscal_S3 * dy_S3;
1196     tz_S0 = fscal_S0 * dz_S0;
1197     tz_S1 = fscal_S1 * dz_S1;
1198     tz_S2 = fscal_S2 * dz_S2;
1199     tz_S3 = fscal_S3 * dz_S3;
1200
1201     /* Increment i atom force */
1202     fix_S0 = fix_S0 + tx_S0;
1203     fix_S1 = fix_S1 + tx_S1;
1204     fix_S2 = fix_S2 + tx_S2;
1205     fix_S3 = fix_S3 + tx_S3;
1206     fiy_S0 = fiy_S0 + ty_S0;
1207     fiy_S1 = fiy_S1 + ty_S1;
1208     fiy_S2 = fiy_S2 + ty_S2;
1209     fiy_S3 = fiy_S3 + ty_S3;
1210     fiz_S0 = fiz_S0 + tz_S0;
1211     fiz_S1 = fiz_S1 + tz_S1;
1212     fiz_S2 = fiz_S2 + tz_S2;
1213     fiz_S3 = fiz_S3 + tz_S3;
1214
1215     /* Decrement j atom force */
1216     store(f + ajx, load<SimdReal>(f + ajx) - (tx_S0 + tx_S1 + tx_S2 + tx_S3));
1217     store(f + ajy, load<SimdReal>(f + ajy) - (ty_S0 + ty_S1 + ty_S2 + ty_S3));
1218     store(f + ajz, load<SimdReal>(f + ajz) - (tz_S0 + tz_S1 + tz_S2 + tz_S3));
1219 }
1220
1221 #    undef rinv_ex_S0
1222 #    undef rinv_ex_S1
1223 #    undef rinv_ex_S2
1224 #    undef rinv_ex_S3
1225
1226 #    undef wco_vdw_S0
1227 #    undef wco_vdw_S1
1228 #    undef wco_vdw_S2
1229 #    undef wco_vdw_S3
1230
1231 #    undef EXCL_FORCES
1232
1233 #endif // !DOXYGEN