src/gromacs/nbnxm/kernels_simd_4xm/kernel_inner.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /* Doxygen gets confused (buggy) about the block in this file in combination with
  37  * the  namespace prefix, and thinks store is documented here.
  38  * This will solve itself with the second-generation nbnxn kernels, so for now
  39  * we just tell Doxygen to stay out.
  40  */
  41 #ifndef DOXYGEN
  42
  43 /* This is the innermost loop contents for the 4 x N atom simd kernel.
  44  * This flavor of the kernel calculates interactions of 4 i-atoms
  45  * with N j-atoms stored in N wide simd registers.
  46  */
  47
  48 /* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
  49  * forces on excluded atom pairs here in the non-bonded loops.
  50  * But when energies and/or virial is required we calculate them
  51  * separately to as then it is easier to separate the energy and virial
  52  * contributions.
  53  */
  54 #    if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
  55 #        define EXCL_FORCES
  56 #    endif
  57
  58 {
  59     int cj, ajx, ajy, ajz;
  60     int gmx_unused aj;
  61
  62 #    ifdef ENERGY_GROUPS
  63     /* Energy group indices for two atoms packed into one int */
  64     int egp_jj[UNROLLJ / 2];
  65 #    endif
  66
  67 #    ifdef CHECK_EXCLS
  68     /* Interaction (non-exclusion) mask of all 1's or 0's */
  69     SimdBool interact_S0;
  70     SimdBool interact_S1;
  71     SimdBool interact_S2;
  72     SimdBool interact_S3;
  73 #    endif
  74
  75     SimdReal jx_S, jy_S, jz_S;
  76     SimdReal dx_S0, dy_S0, dz_S0;
  77     SimdReal dx_S1, dy_S1, dz_S1;
  78     SimdReal dx_S2, dy_S2, dz_S2;
  79     SimdReal dx_S3, dy_S3, dz_S3;
  80     SimdReal tx_S0, ty_S0, tz_S0;
  81     SimdReal tx_S1, ty_S1, tz_S1;
  82     SimdReal tx_S2, ty_S2, tz_S2;
  83     SimdReal tx_S3, ty_S3, tz_S3;
  84     SimdReal rsq_S0, rinv_S0, rinvsq_S0;
  85     SimdReal rsq_S1, rinv_S1, rinvsq_S1;
  86     SimdReal rsq_S2, rinv_S2, rinvsq_S2;
  87     SimdReal rsq_S3, rinv_S3, rinvsq_S3;
  88
  89     /* wco: within cut-off, mask of all 1's or 0's */
  90     SimdBool wco_S0;
  91     SimdBool wco_S1;
  92     SimdBool wco_S2;
  93     SimdBool wco_S3;
  94
  95 #    ifdef VDW_CUTOFF_CHECK
  96     SimdBool wco_vdw_S0;
  97     SimdBool wco_vdw_S1;
  98 #        ifndef HALF_LJ
  99     SimdBool wco_vdw_S2;
 100     SimdBool wco_vdw_S3;
 101 #        endif
 102 #    endif
 103
 104 #    if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 105     SimdReal                                                                                  r_S0;
 106     SimdReal                                                                                  r_S1;
 107 #        if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
 108     SimdReal r_S2;
 109     SimdReal r_S3;
 110 #        endif
 111 #    endif
 112
 113 #    if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 114     SimdReal                               rsw_S0, rsw2_S0;
 115     SimdReal                               rsw_S1, rsw2_S1;
 116 #        ifndef HALF_LJ
 117     SimdReal rsw_S2, rsw2_S2;
 118     SimdReal rsw_S3, rsw2_S3;
 119 #        endif
 120 #    endif
 121
 122 #    ifdef CALC_COULOMB
 123 #        ifdef CHECK_EXCLS
 124     /* 1/r masked with the interaction mask */
 125     SimdReal rinv_ex_S0;
 126     SimdReal rinv_ex_S1;
 127     SimdReal rinv_ex_S2;
 128     SimdReal rinv_ex_S3;
 129 #        endif
 130     SimdReal jq_S;
 131     SimdReal qq_S0;
 132     SimdReal qq_S1;
 133     SimdReal qq_S2;
 134     SimdReal qq_S3;
 135 #        ifdef CALC_COUL_TAB
 136     /* The force (PME mesh force) we need to subtract from 1/r^2 */
 137     SimdReal fsub_S0;
 138     SimdReal fsub_S1;
 139     SimdReal fsub_S2;
 140     SimdReal fsub_S3;
 141 #        endif
 142 #        ifdef CALC_COUL_EWALD
 143     SimdReal brsq_S0, brsq_S1, brsq_S2, brsq_S3;
 144     SimdReal ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
 145 #        endif
 146
 147     /* frcoul = (1/r - fsub)*r */
 148     SimdReal frcoul_S0;
 149     SimdReal frcoul_S1;
 150     SimdReal frcoul_S2;
 151     SimdReal frcoul_S3;
 152 #        ifdef CALC_COUL_TAB
 153     /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
 154     SimdReal rs_S0, rf_S0, frac_S0;
 155     SimdReal rs_S1, rf_S1, frac_S1;
 156     SimdReal rs_S2, rf_S2, frac_S2;
 157     SimdReal rs_S3, rf_S3, frac_S3;
 158     /* Table index: rs truncated to an int */
 159     SimdInt32 ti_S0, ti_S1, ti_S2, ti_S3;
 160     /* Linear force table values */
 161     SimdReal ctab0_S0, ctab1_S0;
 162     SimdReal ctab0_S1, ctab1_S1;
 163     SimdReal ctab0_S2, ctab1_S2;
 164     SimdReal ctab0_S3, ctab1_S3;
 165 #            ifdef CALC_ENERGIES
 166     /* Quadratic energy table value */
 167     SimdReal ctabv_S0, dum_S0;
 168     SimdReal ctabv_S1, dum_S1;
 169     SimdReal ctabv_S2, dum_S2;
 170     SimdReal ctabv_S3, dum_S3;
 171 #            endif
 172 #        endif
 173 #        if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 174     /* The potential (PME mesh) we need to subtract from 1/r */
 175     SimdReal vc_sub_S0;
 176     SimdReal vc_sub_S1;
 177     SimdReal vc_sub_S2;
 178     SimdReal vc_sub_S3;
 179 #        endif
 180 #        ifdef CALC_ENERGIES
 181     /* Electrostatic potential */
 182     SimdReal vcoul_S0;
 183     SimdReal vcoul_S1;
 184     SimdReal vcoul_S2;
 185     SimdReal vcoul_S3;
 186 #        endif
 187 #    endif
 188     /* The force times 1/r */
 189     SimdReal fscal_S0;
 190     SimdReal fscal_S1;
 191     SimdReal fscal_S2;
 192     SimdReal fscal_S3;
 193
 194 #    ifdef CALC_LJ
 195 #        ifdef LJ_COMB_LB
 196     /* LJ sigma_j/2 and sqrt(epsilon_j) */
 197     SimdReal hsig_j_S, seps_j_S;
 198     /* LJ sigma_ij and epsilon_ij */
 199     SimdReal sig_S0, eps_S0;
 200     SimdReal sig_S1, eps_S1;
 201 #            ifndef HALF_LJ
 202     SimdReal sig_S2, eps_S2;
 203     SimdReal sig_S3, eps_S3;
 204 #            endif
 205 #            ifdef CALC_ENERGIES
 206     SimdReal sig2_S0, sig6_S0;
 207     SimdReal sig2_S1, sig6_S1;
 208 #                ifndef HALF_LJ
 209     SimdReal sig2_S2, sig6_S2;
 210     SimdReal sig2_S3, sig6_S3;
 211 #                endif
 212 #            endif /* LJ_COMB_LB */
 213 #        endif     /* CALC_LJ */
 214
 215 #        ifdef LJ_COMB_GEOM
 216     SimdReal c6s_j_S, c12s_j_S;
 217 #        endif
 218
 219 #        if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
 220     /* Index for loading LJ parameters, complicated when interleaving */
 221     int aj2;
 222 #        endif
 223
 224     /* Intermediate variables for LJ calculation */
 225 #        ifndef LJ_COMB_LB
 226     SimdReal rinvsix_S0;
 227     SimdReal rinvsix_S1;
 228 #            ifndef HALF_LJ
 229     SimdReal rinvsix_S2;
 230     SimdReal rinvsix_S3;
 231 #            endif
 232 #        endif
 233 #        ifdef LJ_COMB_LB
 234     SimdReal sir_S0, sir2_S0, sir6_S0;
 235     SimdReal sir_S1, sir2_S1, sir6_S1;
 236 #            ifndef HALF_LJ
 237     SimdReal sir_S2, sir2_S2, sir6_S2;
 238     SimdReal sir_S3, sir2_S3, sir6_S3;
 239 #            endif
 240 #        endif
 241
 242     SimdReal FrLJ6_S0, FrLJ12_S0, frLJ_S0;
 243     SimdReal FrLJ6_S1, FrLJ12_S1, frLJ_S1;
 244 #        ifndef HALF_LJ
 245     SimdReal FrLJ6_S2, FrLJ12_S2, frLJ_S2;
 246     SimdReal FrLJ6_S3, FrLJ12_S3, frLJ_S3;
 247 #        endif
 248 #    endif /* CALC_LJ */
 249
 250     /* j-cluster index */
 251     cj = l_cj[cjind].cj;
 252
 253     /* Atom indices (of the first atom in the cluster) */
 254     aj = cj * UNROLLJ;
 255 #    if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
 256 #        if UNROLLJ == STRIDE
 257     aj2 = aj * 2;
 258 #        else
 259     aj2 = (cj >> 1) * 2 * STRIDE + (cj & 1) * UNROLLJ;
 260 #        endif
 261 #    endif
 262 #    if UNROLLJ == STRIDE
 263     ajx = aj * DIM;
 264 #    else
 265     ajx = (cj >> 1) * DIM * STRIDE + (cj & 1) * UNROLLJ;
 266 #    endif
 267     ajy = ajx + STRIDE;
 268     ajz = ajy + STRIDE;
 269
 270 #    ifdef CHECK_EXCLS
 271     gmx_load_simd_4xn_interactions(static_cast<int>(l_cj[cjind].excl), filter_S0, filter_S1,
 272                                    filter_S2, filter_S3, nbat->simdMasks.interaction_array.data(),
 273                                    &interact_S0, &interact_S1, &interact_S2, &interact_S3);
 274 #    endif /* CHECK_EXCLS */
 275
 276     /* load j atom coordinates */
 277     jx_S = load<SimdReal>(x + ajx);
 278     jy_S = load<SimdReal>(x + ajy);
 279     jz_S = load<SimdReal>(x + ajz);
 280
 281     /* Calculate distance */
 282     dx_S0 = ix_S0 - jx_S;
 283     dy_S0 = iy_S0 - jy_S;
 284     dz_S0 = iz_S0 - jz_S;
 285     dx_S1 = ix_S1 - jx_S;
 286     dy_S1 = iy_S1 - jy_S;
 287     dz_S1 = iz_S1 - jz_S;
 288     dx_S2 = ix_S2 - jx_S;
 289     dy_S2 = iy_S2 - jy_S;
 290     dz_S2 = iz_S2 - jz_S;
 291     dx_S3 = ix_S3 - jx_S;
 292     dy_S3 = iy_S3 - jy_S;
 293     dz_S3 = iz_S3 - jz_S;
 294
 295     /* rsq = dx*dx+dy*dy+dz*dz */
 296     rsq_S0 = norm2(dx_S0, dy_S0, dz_S0);
 297     rsq_S1 = norm2(dx_S1, dy_S1, dz_S1);
 298     rsq_S2 = norm2(dx_S2, dy_S2, dz_S2);
 299     rsq_S3 = norm2(dx_S3, dy_S3, dz_S3);
 300
 301     /* Do the cut-off check */
 302     wco_S0 = (rsq_S0 < rc2_S);
 303     wco_S1 = (rsq_S1 < rc2_S);
 304     wco_S2 = (rsq_S2 < rc2_S);
 305     wco_S3 = (rsq_S3 < rc2_S);
 306
 307 #    ifdef CHECK_EXCLS
 308 #        ifdef EXCL_FORCES
 309     /* Only remove the (sub-)diagonal to avoid double counting */
 310 #            if UNROLLJ == UNROLLI
 311     if (cj == ci_sh)
 312     {
 313         wco_S0 = wco_S0 && diagonal_mask_S0;
 314         wco_S1 = wco_S1 && diagonal_mask_S1;
 315         wco_S2 = wco_S2 && diagonal_mask_S2;
 316         wco_S3 = wco_S3 && diagonal_mask_S3;
 317     }
 318 #            else
 319 #                if UNROLLJ < UNROLLI
 320     if (cj == ci_sh * 2)
 321     {
 322         wco_S0 = wco_S0 && diagonal_mask0_S0;
 323         wco_S1 = wco_S1 && diagonal_mask0_S1;
 324         wco_S2 = wco_S2 && diagonal_mask0_S2;
 325         wco_S3 = wco_S3 && diagonal_mask0_S3;
 326     }
 327     if (cj == ci_sh * 2 + 1)
 328     {
 329         wco_S0 = wco_S0 && diagonal_mask1_S0;
 330         wco_S1 = wco_S1 && diagonal_mask1_S1;
 331         wco_S2 = wco_S2 && diagonal_mask1_S2;
 332         wco_S3 = wco_S3 && diagonal_mask1_S3;
 333     }
 334 #                else
 335     if (cj * 2 == ci_sh)
 336     {
 337         wco_S0 = wco_S0 && diagonal_mask0_S0;
 338         wco_S1 = wco_S1 && diagonal_mask0_S1;
 339         wco_S2 = wco_S2 && diagonal_mask0_S2;
 340         wco_S3 = wco_S3 && diagonal_mask0_S3;
 341     }
 342     else if (cj * 2 + 1 == ci_sh)
 343     {
 344         wco_S0 = wco_S0 && diagonal_mask1_S0;
 345         wco_S1 = wco_S1 && diagonal_mask1_S1;
 346         wco_S2 = wco_S2 && diagonal_mask1_S2;
 347         wco_S3 = wco_S3 && diagonal_mask1_S3;
 348     }
 349 #                endif
 350 #            endif
 351 #        else /* EXCL_FORCES */
 352     /* No exclusion forces: remove all excluded atom pairs from the list */
 353     wco_S0 = wco_S0 && interact_S0;
 354     wco_S1 = wco_S1 && interact_S1;
 355     wco_S2 = wco_S2 && interact_S2;
 356     wco_S3 = wco_S3 && interact_S3;
 357 #        endif
 358 #    endif
 359
 360 #    ifdef COUNT_PAIRS
 361     {
 362         int                              i, j;
 363         alignas(GMX_SIMD_ALIGNMENT) real tmp[2 * GMX_SIMD_REAL_WIDTH];
 364
 365         for (i = 0; i < UNROLLI; i++)
 366         {
 367             store(tmp, rc2_S - (i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
 368             for (j = 0; j < UNROLLJ; j++)
 369             {
 370                 if (tmp[j] >= 0)
 371                 {
 372                     npair++;
 373                 }
 374             }
 375         }
 376     }
 377 #    endif
 378
 379     // Ensure the distances do not fall below the limit where r^-12 overflows.
 380     // This should never happen for normal interactions.
 381     rsq_S0 = max(rsq_S0, minRsq_S);
 382     rsq_S1 = max(rsq_S1, minRsq_S);
 383     rsq_S2 = max(rsq_S2, minRsq_S);
 384     rsq_S3 = max(rsq_S3, minRsq_S);
 385
 386     /* Calculate 1/r */
 387 #    if !GMX_DOUBLE
 388     rinv_S0 = invsqrt(rsq_S0);
 389     rinv_S1 = invsqrt(rsq_S1);
 390     rinv_S2 = invsqrt(rsq_S2);
 391     rinv_S3 = invsqrt(rsq_S3);
 392 #    else
 393     invsqrtPair(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
 394     invsqrtPair(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
 395 #    endif
 396
 397 #    ifdef CALC_COULOMB
 398     /* Load parameters for j atom */
 399     jq_S  = load<SimdReal>(q + aj);
 400     qq_S0 = iq_S0 * jq_S;
 401     qq_S1 = iq_S1 * jq_S;
 402     qq_S2 = iq_S2 * jq_S;
 403     qq_S3 = iq_S3 * jq_S;
 404 #    endif
 405
 406 #    ifdef CALC_LJ
 407 #        if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
 408     SimdReal c6_S0, c6_S1, c12_S0, c12_S1;
 409     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp0, type + aj, &c6_S0, &c12_S0);
 410     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp1, type + aj, &c6_S1, &c12_S1);
 411 #            ifndef HALF_LJ
 412     SimdReal c6_S2, c6_S3, c12_S2, c12_S3;
 413     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp2, type + aj, &c6_S2, &c12_S2);
 414     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp3, type + aj, &c6_S3, &c12_S3);
 415 #            endif
 416 #        endif /* not defined any LJ rule */
 417
 418 #        ifdef LJ_COMB_GEOM
 419     c6s_j_S        = load<SimdReal>(ljc + aj2 + 0);
 420     c12s_j_S       = load<SimdReal>(ljc + aj2 + STRIDE);
 421     SimdReal c6_S0 = c6s_S0 * c6s_j_S;
 422     SimdReal c6_S1 = c6s_S1 * c6s_j_S;
 423 #            ifndef HALF_LJ
 424     SimdReal c6_S2 = c6s_S2 * c6s_j_S;
 425     SimdReal c6_S3 = c6s_S3 * c6s_j_S;
 426 #            endif
 427     SimdReal c12_S0 = c12s_S0 * c12s_j_S;
 428     SimdReal c12_S1 = c12s_S1 * c12s_j_S;
 429 #            ifndef HALF_LJ
 430     SimdReal c12_S2 = c12s_S2 * c12s_j_S;
 431     SimdReal c12_S3 = c12s_S3 * c12s_j_S;
 432 #            endif
 433 #        endif /* LJ_COMB_GEOM */
 434
 435 #        ifdef LJ_COMB_LB
 436     hsig_j_S = load<SimdReal>(ljc + aj2 + 0);
 437     seps_j_S = load<SimdReal>(ljc + aj2 + STRIDE);
 438
 439     sig_S0 = hsig_i_S0 + hsig_j_S;
 440     sig_S1 = hsig_i_S1 + hsig_j_S;
 441     eps_S0 = seps_i_S0 * seps_j_S;
 442     eps_S1 = seps_i_S1 * seps_j_S;
 443 #            ifndef HALF_LJ
 444     sig_S2 = hsig_i_S2 + hsig_j_S;
 445     sig_S3 = hsig_i_S3 + hsig_j_S;
 446     eps_S2 = seps_i_S2 * seps_j_S;
 447     eps_S3 = seps_i_S3 * seps_j_S;
 448 #            endif
 449 #        endif /* LJ_COMB_LB */
 450
 451 #    endif /* CALC_LJ */
 452
 453     /* Set rinv to zero for r beyond the cut-off */
 454     rinv_S0 = selectByMask(rinv_S0, wco_S0);
 455     rinv_S1 = selectByMask(rinv_S1, wco_S1);
 456     rinv_S2 = selectByMask(rinv_S2, wco_S2);
 457     rinv_S3 = selectByMask(rinv_S3, wco_S3);
 458
 459     rinvsq_S0 = rinv_S0 * rinv_S0;
 460     rinvsq_S1 = rinv_S1 * rinv_S1;
 461     rinvsq_S2 = rinv_S2 * rinv_S2;
 462     rinvsq_S3 = rinv_S3 * rinv_S3;
 463
 464 #    ifdef CALC_COULOMB
 465     /* Note that here we calculate force*r, not the usual force/r.
 466      * This allows avoiding masking the reaction-field contribution,
 467      * as frcoul is later multiplied by rinvsq which has been
 468      * masked with the cut-off check.
 469      */
 470
 471 #        ifdef EXCL_FORCES
 472     /* Only add 1/r for non-excluded atom pairs */
 473     rinv_ex_S0 = selectByMask(rinv_S0, interact_S0);
 474     rinv_ex_S1 = selectByMask(rinv_S1, interact_S1);
 475     rinv_ex_S2 = selectByMask(rinv_S2, interact_S2);
 476     rinv_ex_S3 = selectByMask(rinv_S3, interact_S3);
 477 #        else
 478     /* No exclusion forces, we always need 1/r */
 479 #            define rinv_ex_S0 rinv_S0
 480 #            define rinv_ex_S1 rinv_S1
 481 #            define rinv_ex_S2 rinv_S2
 482 #            define rinv_ex_S3 rinv_S3
 483 #        endif
 484
 485 #        ifdef CALC_COUL_RF
 486     /* Electrostatic interactions */
 487     frcoul_S0 = qq_S0 * fma(rsq_S0, mrc_3_S, rinv_ex_S0);
 488     frcoul_S1 = qq_S1 * fma(rsq_S1, mrc_3_S, rinv_ex_S1);
 489     frcoul_S2 = qq_S2 * fma(rsq_S2, mrc_3_S, rinv_ex_S2);
 490     frcoul_S3 = qq_S3 * fma(rsq_S3, mrc_3_S, rinv_ex_S3);
 491
 492 #            ifdef CALC_ENERGIES
 493     vcoul_S0 = qq_S0 * (rinv_ex_S0 + fma(rsq_S0, hrc_3_S, moh_rc_S));
 494     vcoul_S1 = qq_S1 * (rinv_ex_S1 + fma(rsq_S1, hrc_3_S, moh_rc_S));
 495     vcoul_S2 = qq_S2 * (rinv_ex_S2 + fma(rsq_S2, hrc_3_S, moh_rc_S));
 496     vcoul_S3 = qq_S3 * (rinv_ex_S3 + fma(rsq_S3, hrc_3_S, moh_rc_S));
 497 #            endif
 498 #        endif
 499
 500 #        ifdef CALC_COUL_EWALD
 501     /* We need to mask (or limit) rsq for the cut-off,
 502      * as large distances can cause an overflow in gmx_pmecorrF/V.
 503      */
 504     brsq_S0   = beta2_S * selectByMask(rsq_S0, wco_S0);
 505     brsq_S1   = beta2_S * selectByMask(rsq_S1, wco_S1);
 506     brsq_S2   = beta2_S * selectByMask(rsq_S2, wco_S2);
 507     brsq_S3   = beta2_S * selectByMask(rsq_S3, wco_S3);
 508     ewcorr_S0 = beta_S * pmeForceCorrection(brsq_S0);
 509     ewcorr_S1 = beta_S * pmeForceCorrection(brsq_S1);
 510     ewcorr_S2 = beta_S * pmeForceCorrection(brsq_S2);
 511     ewcorr_S3 = beta_S * pmeForceCorrection(brsq_S3);
 512     frcoul_S0 = qq_S0 * fma(ewcorr_S0, brsq_S0, rinv_ex_S0);
 513     frcoul_S1 = qq_S1 * fma(ewcorr_S1, brsq_S1, rinv_ex_S1);
 514     frcoul_S2 = qq_S2 * fma(ewcorr_S2, brsq_S2, rinv_ex_S2);
 515     frcoul_S3 = qq_S3 * fma(ewcorr_S3, brsq_S3, rinv_ex_S3);
 516
 517 #            ifdef CALC_ENERGIES
 518     vc_sub_S0 = beta_S * pmePotentialCorrection(brsq_S0);
 519     vc_sub_S1 = beta_S * pmePotentialCorrection(brsq_S1);
 520     vc_sub_S2 = beta_S * pmePotentialCorrection(brsq_S2);
 521     vc_sub_S3 = beta_S * pmePotentialCorrection(brsq_S3);
 522 #            endif
 523
 524 #        endif /* CALC_COUL_EWALD */
 525
 526 #        ifdef CALC_COUL_TAB
 527     /* Electrostatic interactions */
 528     r_S0 = rsq_S0 * rinv_S0;
 529     r_S1 = rsq_S1 * rinv_S1;
 530     r_S2 = rsq_S2 * rinv_S2;
 531     r_S3 = rsq_S3 * rinv_S3;
 532     /* Convert r to scaled table units */
 533     rs_S0 = r_S0 * invtsp_S;
 534     rs_S1 = r_S1 * invtsp_S;
 535     rs_S2 = r_S2 * invtsp_S;
 536     rs_S3 = r_S3 * invtsp_S;
 537     /* Truncate scaled r to an int */
 538     ti_S0 = cvttR2I(rs_S0);
 539     ti_S1 = cvttR2I(rs_S1);
 540     ti_S2 = cvttR2I(rs_S2);
 541     ti_S3 = cvttR2I(rs_S3);
 542
 543     rf_S0 = trunc(rs_S0);
 544     rf_S1 = trunc(rs_S1);
 545     rf_S2 = trunc(rs_S2);
 546     rf_S3 = trunc(rs_S3);
 547
 548     frac_S0 = rs_S0 - rf_S0;
 549     frac_S1 = rs_S1 - rf_S1;
 550     frac_S2 = rs_S2 - rf_S2;
 551     frac_S3 = rs_S3 - rf_S3;
 552
 553     /* Load and interpolate table forces and possibly energies.
 554      * Force and energy can be combined in one table, stride 4: FDV0
 555      * or in two separate tables with stride 1: F and V
 556      * Currently single precision uses FDV0, double F and V.
 557      */
 558 #            ifndef CALC_ENERGIES
 559 #                ifdef TAB_FDV0
 560     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 561     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
 562     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 563     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
 564 #                else
 565     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 566     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
 567     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 568     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
 569     ctab1_S0  = ctab1_S0 - ctab0_S0;
 570     ctab1_S1  = ctab1_S1 - ctab0_S1;
 571     ctab1_S2  = ctab1_S2 - ctab0_S2;
 572     ctab1_S3  = ctab1_S3 - ctab0_S3;
 573 #                endif
 574 #            else
 575 #                ifdef TAB_FDV0
 576     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0, &ctabv_S0, &dum_S0);
 577     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1, &ctabv_S1, &dum_S1);
 578     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2, &ctabv_S2, &dum_S2);
 579     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3, &ctabv_S3, &dum_S3);
 580 #                else
 581     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 582     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S0, &ctabv_S0, &dum_S0);
 583     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
 584     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S1, &ctabv_S1, &dum_S1);
 585     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 586     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S2, &ctabv_S2, &dum_S2);
 587     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
 588     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S3, &ctabv_S3, &dum_S3);
 589     ctab1_S0 = ctab1_S0 - ctab0_S0;
 590     ctab1_S1 = ctab1_S1 - ctab0_S1;
 591     ctab1_S2 = ctab1_S2 - ctab0_S2;
 592     ctab1_S3 = ctab1_S3 - ctab0_S3;
 593 #                endif
 594 #            endif
 595     fsub_S0   = fma(frac_S0, ctab1_S0, ctab0_S0);
 596     fsub_S1   = fma(frac_S1, ctab1_S1, ctab0_S1);
 597     fsub_S2   = fma(frac_S2, ctab1_S2, ctab0_S2);
 598     fsub_S3   = fma(frac_S3, ctab1_S3, ctab0_S3);
 599     frcoul_S0 = qq_S0 * fnma(fsub_S0, r_S0, rinv_ex_S0);
 600     frcoul_S1 = qq_S1 * fnma(fsub_S1, r_S1, rinv_ex_S1);
 601     frcoul_S2 = qq_S2 * fnma(fsub_S2, r_S2, rinv_ex_S2);
 602     frcoul_S3 = qq_S3 * fnma(fsub_S3, r_S3, rinv_ex_S3);
 603
 604 #            ifdef CALC_ENERGIES
 605     vc_sub_S0 = fma((mhalfsp_S * frac_S0), (ctab0_S0 + fsub_S0), ctabv_S0);
 606     vc_sub_S1 = fma((mhalfsp_S * frac_S1), (ctab0_S1 + fsub_S1), ctabv_S1);
 607     vc_sub_S2 = fma((mhalfsp_S * frac_S2), (ctab0_S2 + fsub_S2), ctabv_S2);
 608     vc_sub_S3 = fma((mhalfsp_S * frac_S3), (ctab0_S3 + fsub_S3), ctabv_S3);
 609 #            endif
 610 #        endif /* CALC_COUL_TAB */
 611
 612 #        if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 613 #            ifndef NO_SHIFT_EWALD
 614     /* Add Ewald potential shift to vc_sub for convenience */
 615 #                ifdef CHECK_EXCLS
 616     vc_sub_S0 = vc_sub_S0 + selectByMask(sh_ewald_S, interact_S0);
 617     vc_sub_S1 = vc_sub_S1 + selectByMask(sh_ewald_S, interact_S1);
 618     vc_sub_S2 = vc_sub_S2 + selectByMask(sh_ewald_S, interact_S2);
 619     vc_sub_S3 = vc_sub_S3 + selectByMask(sh_ewald_S, interact_S3);
 620 #                else
 621     vc_sub_S0 = vc_sub_S0 + sh_ewald_S;
 622     vc_sub_S1 = vc_sub_S1 + sh_ewald_S;
 623     vc_sub_S2 = vc_sub_S2 + sh_ewald_S;
 624     vc_sub_S3 = vc_sub_S3 + sh_ewald_S;
 625 #                endif
 626 #            endif
 627
 628     vcoul_S0 = qq_S0 * (rinv_ex_S0 - vc_sub_S0);
 629     vcoul_S1 = qq_S1 * (rinv_ex_S1 - vc_sub_S1);
 630     vcoul_S2 = qq_S2 * (rinv_ex_S2 - vc_sub_S2);
 631     vcoul_S3 = qq_S3 * (rinv_ex_S3 - vc_sub_S3);
 632
 633 #        endif
 634
 635 #        ifdef CALC_ENERGIES
 636     /* Mask energy for cut-off and diagonal */
 637     vcoul_S0 = selectByMask(vcoul_S0, wco_S0);
 638     vcoul_S1 = selectByMask(vcoul_S1, wco_S1);
 639     vcoul_S2 = selectByMask(vcoul_S2, wco_S2);
 640     vcoul_S3 = selectByMask(vcoul_S3, wco_S3);
 641 #        endif
 642
 643 #    endif /* CALC_COULOMB */
 644
 645 #    ifdef CALC_LJ
 646     /* Lennard-Jones interaction */
 647
 648 #        ifdef VDW_CUTOFF_CHECK
 649     wco_vdw_S0 = (rsq_S0 < rcvdw2_S);
 650     wco_vdw_S1 = (rsq_S1 < rcvdw2_S);
 651 #            ifndef HALF_LJ
 652     wco_vdw_S2 = (rsq_S2 < rcvdw2_S);
 653     wco_vdw_S3 = (rsq_S3 < rcvdw2_S);
 654 #            endif
 655 #        else
 656     /* Same cut-off for Coulomb and VdW, reuse the registers */
 657 #            define wco_vdw_S0 wco_S0
 658 #            define wco_vdw_S1 wco_S1
 659 #            define wco_vdw_S2 wco_S2
 660 #            define wco_vdw_S3 wco_S3
 661 #        endif
 662
 663 #        ifndef LJ_COMB_LB
 664     rinvsix_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
 665     rinvsix_S1 = rinvsq_S1 * rinvsq_S1 * rinvsq_S1;
 666 #            ifdef EXCL_FORCES
 667     rinvsix_S0 = selectByMask(rinvsix_S0, interact_S0);
 668     rinvsix_S1 = selectByMask(rinvsix_S1, interact_S1);
 669 #            endif
 670 #            ifndef HALF_LJ
 671     rinvsix_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
 672     rinvsix_S3 = rinvsq_S3 * rinvsq_S3 * rinvsq_S3;
 673 #                ifdef EXCL_FORCES
 674     rinvsix_S2 = selectByMask(rinvsix_S2, interact_S2);
 675     rinvsix_S3 = selectByMask(rinvsix_S3, interact_S3);
 676 #                endif
 677 #            endif
 678
 679 #            if defined LJ_CUT || defined LJ_POT_SWITCH
 680     /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
 681     FrLJ6_S0 = c6_S0 * rinvsix_S0;
 682     FrLJ6_S1 = c6_S1 * rinvsix_S1;
 683 #                ifndef HALF_LJ
 684     FrLJ6_S2 = c6_S2 * rinvsix_S2;
 685     FrLJ6_S3 = c6_S3 * rinvsix_S3;
 686 #                endif
 687     FrLJ12_S0 = c12_S0 * rinvsix_S0 * rinvsix_S0;
 688     FrLJ12_S1 = c12_S1 * rinvsix_S1 * rinvsix_S1;
 689 #                ifndef HALF_LJ
 690     FrLJ12_S2 = c12_S2 * rinvsix_S2 * rinvsix_S2;
 691     FrLJ12_S3 = c12_S3 * rinvsix_S3 * rinvsix_S3;
 692 #                endif
 693 #            endif
 694
 695 #            if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 696     /* We switch the LJ force */
 697     r_S0    = rsq_S0 * rinv_S0;
 698     rsw_S0  = max(r_S0 - rswitch_S, zero_S);
 699     rsw2_S0 = rsw_S0 * rsw_S0;
 700     r_S1    = rsq_S1 * rinv_S1;
 701     rsw_S1  = max(r_S1 - rswitch_S, zero_S);
 702     rsw2_S1 = rsw_S1 * rsw_S1;
 703 #                ifndef HALF_LJ
 704     r_S2    = rsq_S2 * rinv_S2;
 705     rsw_S2  = max(r_S2 - rswitch_S, zero_S);
 706     rsw2_S2 = rsw_S2 * rsw_S2;
 707     r_S3    = rsq_S3 * rinv_S3;
 708     rsw_S3  = max(r_S3 - rswitch_S, zero_S);
 709     rsw2_S3 = rsw_S3 * rsw_S3;
 710 #                endif
 711 #            endif
 712
 713 #            ifdef LJ_FORCE_SWITCH
 714
 715 #                define gmx_add_fr_switch(fr, rsw, rsw2_r, c2, c3) \
 716                     fma(fma(c3, rsw, c2), rsw2_r, (fr))
 717     SimdReal rsw2_r_S0 = rsw2_S0 * r_S0;
 718     SimdReal rsw2_r_S1 = rsw2_S1 * r_S1;
 719     FrLJ6_S0 = c6_S0 * gmx_add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S);
 720     FrLJ6_S1 = c6_S1 * gmx_add_fr_switch(rinvsix_S1, rsw_S1, rsw2_r_S1, p6_fc2_S, p6_fc3_S);
 721 #                ifndef HALF_LJ
 722     SimdReal rsw2_r_S2 = rsw2_S2 * r_S2;
 723     SimdReal rsw2_r_S3 = rsw2_S3 * r_S3;
 724     FrLJ6_S2 = c6_S2 * gmx_add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S);
 725     FrLJ6_S3 = c6_S3 * gmx_add_fr_switch(rinvsix_S3, rsw_S3, rsw2_r_S3, p6_fc2_S, p6_fc3_S);
 726 #                endif
 727     FrLJ12_S0 = c12_S0 * gmx_add_fr_switch((rinvsix_S0 * rinvsix_S0), rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S);
 728     FrLJ12_S1 = c12_S1 * gmx_add_fr_switch((rinvsix_S1 * rinvsix_S1), rsw_S1, rsw2_r_S1, p12_fc2_S, p12_fc3_S);
 729 #                ifndef HALF_LJ
 730     FrLJ12_S2 = c12_S2 * gmx_add_fr_switch((rinvsix_S2 * rinvsix_S2), rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S);
 731     FrLJ12_S3 = c12_S3 * gmx_add_fr_switch((rinvsix_S3 * rinvsix_S3), rsw_S3, rsw2_r_S3, p12_fc2_S, p12_fc3_S);
 732 #                endif
 733 #                undef gmx_add_fr_switch
 734 #            endif /* LJ_FORCE_SWITCH */
 735
 736 #        endif /* not LJ_COMB_LB */
 737
 738 #        ifdef LJ_COMB_LB
 739     sir_S0 = sig_S0 * rinv_S0;
 740     sir_S1 = sig_S1 * rinv_S1;
 741 #            ifndef HALF_LJ
 742     sir_S2 = sig_S2 * rinv_S2;
 743     sir_S3 = sig_S3 * rinv_S3;
 744 #            endif
 745     sir2_S0 = sir_S0 * sir_S0;
 746     sir2_S1 = sir_S1 * sir_S1;
 747 #            ifndef HALF_LJ
 748     sir2_S2 = sir_S2 * sir_S2;
 749     sir2_S3 = sir_S3 * sir_S3;
 750 #            endif
 751     sir6_S0 = sir2_S0 * sir2_S0 * sir2_S0;
 752     sir6_S1 = sir2_S1 * sir2_S1 * sir2_S1;
 753 #            ifdef EXCL_FORCES
 754     sir6_S0 = selectByMask(sir6_S0, interact_S0);
 755     sir6_S1 = selectByMask(sir6_S1, interact_S1);
 756 #            endif
 757 #            ifndef HALF_LJ
 758     sir6_S2 = sir2_S2 * sir2_S2 * sir2_S2;
 759     sir6_S3 = sir2_S3 * sir2_S3 * sir2_S3;
 760 #                ifdef EXCL_FORCES
 761     sir6_S2 = selectByMask(sir6_S2, interact_S2);
 762     sir6_S3 = selectByMask(sir6_S3, interact_S3);
 763 #                endif
 764 #            endif
 765 #            ifdef VDW_CUTOFF_CHECK
 766     sir6_S0 = selectByMask(sir6_S0, wco_vdw_S0);
 767     sir6_S1 = selectByMask(sir6_S1, wco_vdw_S1);
 768 #                ifndef HALF_LJ
 769     sir6_S2 = selectByMask(sir6_S2, wco_vdw_S2);
 770     sir6_S3 = selectByMask(sir6_S3, wco_vdw_S3);
 771 #                endif
 772 #            endif
 773     FrLJ6_S0 = eps_S0 * sir6_S0;
 774     FrLJ6_S1 = eps_S1 * sir6_S1;
 775 #            ifndef HALF_LJ
 776     FrLJ6_S2 = eps_S2 * sir6_S2;
 777     FrLJ6_S3 = eps_S3 * sir6_S3;
 778 #            endif
 779     FrLJ12_S0 = FrLJ6_S0 * sir6_S0;
 780     FrLJ12_S1 = FrLJ6_S1 * sir6_S1;
 781 #            ifndef HALF_LJ
 782     FrLJ12_S2 = FrLJ6_S2 * sir6_S2;
 783     FrLJ12_S3 = FrLJ6_S3 * sir6_S3;
 784 #            endif
 785 #            if defined CALC_ENERGIES
 786     /* We need C6 and C12 to calculate the LJ potential shift */
 787     sig2_S0 = sig_S0 * sig_S0;
 788     sig2_S1 = sig_S1 * sig_S1;
 789 #                ifndef HALF_LJ
 790     sig2_S2 = sig_S2 * sig_S2;
 791     sig2_S3 = sig_S3 * sig_S3;
 792 #                endif
 793     sig6_S0 = sig2_S0 * sig2_S0 * sig2_S0;
 794     sig6_S1 = sig2_S1 * sig2_S1 * sig2_S1;
 795 #                ifndef HALF_LJ
 796     sig6_S2 = sig2_S2 * sig2_S2 * sig2_S2;
 797     sig6_S3 = sig2_S3 * sig2_S3 * sig2_S3;
 798 #                endif
 799     SimdReal c6_S0 = eps_S0 * sig6_S0;
 800     SimdReal c6_S1 = eps_S1 * sig6_S1;
 801 #                ifndef HALF_LJ
 802     SimdReal c6_S2 = eps_S2 * sig6_S2;
 803     SimdReal c6_S3 = eps_S3 * sig6_S3;
 804 #                endif
 805     SimdReal c12_S0 = c6_S0 * sig6_S0;
 806     SimdReal c12_S1 = c6_S1 * sig6_S1;
 807 #                ifndef HALF_LJ
 808     SimdReal c12_S2 = c6_S2 * sig6_S2;
 809     SimdReal c12_S3 = c6_S3 * sig6_S3;
 810 #                endif
 811 #            endif
 812 #        endif /* LJ_COMB_LB */
 813
 814     /* Determine the total scalar LJ force*r */
 815     frLJ_S0 = FrLJ12_S0 - FrLJ6_S0;
 816     frLJ_S1 = FrLJ12_S1 - FrLJ6_S1;
 817 #        ifndef HALF_LJ
 818     frLJ_S2 = FrLJ12_S2 - FrLJ6_S2;
 819     frLJ_S3 = FrLJ12_S3 - FrLJ6_S3;
 820 #        endif
 821
 822 #        if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
 823
 824 #            ifdef LJ_CUT
 825     /* Calculate the LJ energies, with constant potential shift */
 826     SimdReal VLJ6_S0 = sixth_S * fma(c6_S0, p6_cpot_S, FrLJ6_S0);
 827     SimdReal VLJ6_S1 = sixth_S * fma(c6_S1, p6_cpot_S, FrLJ6_S1);
 828 #                ifndef HALF_LJ
 829     SimdReal VLJ6_S2 = sixth_S * fma(c6_S2, p6_cpot_S, FrLJ6_S2);
 830     SimdReal VLJ6_S3 = sixth_S * fma(c6_S3, p6_cpot_S, FrLJ6_S3);
 831 #                endif
 832     SimdReal VLJ12_S0 = twelveth_S * fma(c12_S0, p12_cpot_S, FrLJ12_S0);
 833     SimdReal VLJ12_S1 = twelveth_S * fma(c12_S1, p12_cpot_S, FrLJ12_S1);
 834 #                ifndef HALF_LJ
 835     SimdReal VLJ12_S2 = twelveth_S * fma(c12_S2, p12_cpot_S, FrLJ12_S2);
 836     SimdReal VLJ12_S3 = twelveth_S * fma(c12_S3, p12_cpot_S, FrLJ12_S3);
 837 #                endif
 838 #            endif /* LJ_CUT */
 839
 840 #            ifdef LJ_FORCE_SWITCH
 841 #                define v_fswitch_r(rsw, rsw2, c0, c3, c4) \
 842                     fma(fma((c4), (rsw), (c3)), ((rsw2) * (rsw)), (c0))
 843
 844     SimdReal VLJ6_S0 =
 845             c6_S0 * fma(sixth_S, rinvsix_S0, v_fswitch_r(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 846     SimdReal VLJ6_S1 =
 847             c6_S1 * fma(sixth_S, rinvsix_S1, v_fswitch_r(rsw_S1, rsw2_S1, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 848 #                ifndef HALF_LJ
 849     SimdReal VLJ6_S2 =
 850             c6_S2 * fma(sixth_S, rinvsix_S2, v_fswitch_r(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 851     SimdReal VLJ6_S3 =
 852             c6_S3 * fma(sixth_S, rinvsix_S3, v_fswitch_r(rsw_S3, rsw2_S3, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 853 #                endif
 854     SimdReal VLJ12_S0 = c12_S0
 855                         * fma(twelveth_S, rinvsix_S0 * rinvsix_S0,
 856                               v_fswitch_r(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 857     SimdReal VLJ12_S1 = c12_S1
 858                         * fma(twelveth_S, rinvsix_S1 * rinvsix_S1,
 859                               v_fswitch_r(rsw_S1, rsw2_S1, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 860 #                ifndef HALF_LJ
 861     SimdReal VLJ12_S2 = c12_S2
 862                         * fma(twelveth_S, rinvsix_S2 * rinvsix_S2,
 863                               v_fswitch_r(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 864     SimdReal VLJ12_S3 = c12_S3
 865                         * fma(twelveth_S, rinvsix_S3 * rinvsix_S3,
 866                               v_fswitch_r(rsw_S3, rsw2_S3, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 867 #                endif
 868 #                undef v_fswitch_r
 869 #            endif /* LJ_FORCE_SWITCH */
 870
 871     /* Add up the repulsion and dispersion */
 872     SimdReal VLJ_S0 = VLJ12_S0 - VLJ6_S0;
 873     SimdReal VLJ_S1 = VLJ12_S1 - VLJ6_S1;
 874 #            ifndef HALF_LJ
 875     SimdReal VLJ_S2 = VLJ12_S2 - VLJ6_S2;
 876     SimdReal VLJ_S3 = VLJ12_S3 - VLJ6_S3;
 877 #            endif
 878
 879 #        endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
 880
 881 #        ifdef LJ_POT_SWITCH
 882     /* We always need the potential, since it is needed for the force */
 883     SimdReal VLJ_S0 = fnma(sixth_S, FrLJ6_S0, twelveth_S * FrLJ12_S0);
 884     SimdReal VLJ_S1 = fnma(sixth_S, FrLJ6_S1, twelveth_S * FrLJ12_S1);
 885 #            ifndef HALF_LJ
 886     SimdReal VLJ_S2 = fnma(sixth_S, FrLJ6_S2, twelveth_S * FrLJ12_S2);
 887     SimdReal VLJ_S3 = fnma(sixth_S, FrLJ6_S3, twelveth_S * FrLJ12_S3);
 888 #            endif
 889
 890     {
 891         SimdReal sw_S0, dsw_S0;
 892         SimdReal sw_S1, dsw_S1;
 893 #            ifndef HALF_LJ
 894         SimdReal sw_S2, dsw_S2;
 895         SimdReal sw_S3, dsw_S3;
 896 #            endif
 897
 898 #            define switch_r(rsw, rsw2, c3, c4, c5) \
 899                 fma(fma(fma(c5, rsw, c4), rsw, c3), ((rsw2) * (rsw)), one_S)
 900 #            define dswitch_r(rsw, rsw2, c2, c3, c4) (fma(fma(c4, rsw, c3), rsw, c2) * (rsw2))
 901
 902         sw_S0  = switch_r(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
 903         dsw_S0 = dswitch_r(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
 904         sw_S1  = switch_r(rsw_S1, rsw2_S1, swV3_S, swV4_S, swV5_S);
 905         dsw_S1 = dswitch_r(rsw_S1, rsw2_S1, swF2_S, swF3_S, swF4_S);
 906 #            ifndef HALF_LJ
 907         sw_S2  = switch_r(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
 908         dsw_S2 = dswitch_r(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
 909         sw_S3  = switch_r(rsw_S3, rsw2_S3, swV3_S, swV4_S, swV5_S);
 910         dsw_S3 = dswitch_r(rsw_S3, rsw2_S3, swF2_S, swF3_S, swF4_S);
 911 #            endif
 912         frLJ_S0 = fnma(dsw_S0 * VLJ_S0, r_S0, sw_S0 * frLJ_S0);
 913         frLJ_S1 = fnma(dsw_S1 * VLJ_S1, r_S1, sw_S1 * frLJ_S1);
 914 #            ifndef HALF_LJ
 915         frLJ_S2 = fnma(dsw_S2 * VLJ_S2, r_S2, sw_S2 * frLJ_S2);
 916         frLJ_S3 = fnma(dsw_S3 * VLJ_S3, r_S3, sw_S3 * frLJ_S3);
 917 #            endif
 918 #            ifdef CALC_ENERGIES
 919         VLJ_S0 = sw_S0 * VLJ_S0;
 920         VLJ_S1 = sw_S1 * VLJ_S1;
 921 #                ifndef HALF_LJ
 922         VLJ_S2 = sw_S2 * VLJ_S2;
 923         VLJ_S3 = sw_S3 * VLJ_S3;
 924 #                endif
 925 #            endif
 926
 927 #            undef switch_r
 928 #            undef dswitch_r
 929     }
 930 #        endif /* LJ_POT_SWITCH */
 931
 932 #        if defined CALC_ENERGIES && defined CHECK_EXCLS
 933     /* The potential shift should be removed for excluded pairs */
 934     VLJ_S0 = selectByMask(VLJ_S0, interact_S0);
 935     VLJ_S1 = selectByMask(VLJ_S1, interact_S1);
 936 #            ifndef HALF_LJ
 937     VLJ_S2 = selectByMask(VLJ_S2, interact_S2);
 938     VLJ_S3 = selectByMask(VLJ_S3, interact_S3);
 939 #            endif
 940 #        endif
 941
 942 #        ifdef LJ_EWALD_GEOM
 943     {
 944         SimdReal c6s_j_S;
 945         SimdReal c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0;
 946         SimdReal c6grid_S1, rinvsix_nm_S1, cr2_S1, expmcr2_S1, poly_S1;
 947 #            ifndef HALF_LJ
 948         SimdReal c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2;
 949         SimdReal c6grid_S3, rinvsix_nm_S3, cr2_S3, expmcr2_S3, poly_S3;
 950 #            endif
 951 #            ifdef CALC_ENERGIES
 952         SimdReal sh_mask_S0;
 953         SimdReal sh_mask_S1;
 954 #                ifndef HALF_LJ
 955         SimdReal sh_mask_S2;
 956         SimdReal sh_mask_S3;
 957 #                endif
 958 #            endif
 959
 960         /* Determine C6 for the grid using the geometric combination rule */
 961         c6s_j_S   = load<SimdReal>(ljc + aj2 + 0);
 962         c6grid_S0 = c6s_S0 * c6s_j_S;
 963         c6grid_S1 = c6s_S1 * c6s_j_S;
 964 #            ifndef HALF_LJ
 965         c6grid_S2 = c6s_S2 * c6s_j_S;
 966         c6grid_S3 = c6s_S3 * c6s_j_S;
 967 #            endif
 968
 969 #            ifdef CHECK_EXCLS
 970         /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
 971         rinvsix_nm_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
 972         rinvsix_nm_S1 = rinvsq_S1 * rinvsq_S1 * rinvsq_S1;
 973 #                ifndef HALF_LJ
 974         rinvsix_nm_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
 975         rinvsix_nm_S3 = rinvsq_S3 * rinvsq_S3 * rinvsq_S3;
 976 #                endif
 977 #            else
 978         /* We didn't use a mask, so we can copy */
 979         rinvsix_nm_S0 = rinvsix_S0;
 980         rinvsix_nm_S1 = rinvsix_S1;
 981 #                ifndef HALF_LJ
 982         rinvsix_nm_S2 = rinvsix_S2;
 983         rinvsix_nm_S3 = rinvsix_S3;
 984 #                endif
 985 #            endif
 986
 987         /* Mask for the cut-off to avoid overflow of cr2^2 */
 988         cr2_S0 = lje_c2_S * selectByMask(rsq_S0, wco_vdw_S0);
 989         cr2_S1 = lje_c2_S * selectByMask(rsq_S1, wco_vdw_S1);
 990 #            ifndef HALF_LJ
 991         cr2_S2 = lje_c2_S * selectByMask(rsq_S2, wco_vdw_S2);
 992         cr2_S3 = lje_c2_S * selectByMask(rsq_S3, wco_vdw_S3);
 993 #            endif
 994         // Unsafe version of our exp() should be fine, since these arguments should never
 995         // be smaller than -127 for any reasonable choice of cutoff or ewald coefficients.
 996         expmcr2_S0 = exp<MathOptimization::Unsafe>(-cr2_S0);
 997         expmcr2_S1 = exp<MathOptimization::Unsafe>(-cr2_S1);
 998 #            ifndef HALF_LJ
 999         expmcr2_S2 = exp<MathOptimization::Unsafe>(-cr2_S2);
1000         expmcr2_S3 = exp<MathOptimization::Unsafe>(-cr2_S3);
1001 #            endif
1002
1003         /* 1 + cr2 + 1/2*cr2^2 */
1004         poly_S0 = fma(fma(half_S, cr2_S0, one_S), cr2_S0, one_S);
1005         poly_S1 = fma(fma(half_S, cr2_S1, one_S), cr2_S1, one_S);
1006 #            ifndef HALF_LJ
1007         poly_S2 = fma(fma(half_S, cr2_S2, one_S), cr2_S2, one_S);
1008         poly_S3 = fma(fma(half_S, cr2_S3, one_S), cr2_S3, one_S);
1009 #            endif
1010
1011         /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
1012          * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
1013          */
1014         frLJ_S0 = fma(c6grid_S0,
1015                       fnma(expmcr2_S0, fma(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0);
1016         frLJ_S1 = fma(c6grid_S1,
1017                       fnma(expmcr2_S1, fma(rinvsix_nm_S1, poly_S1, lje_c6_6_S), rinvsix_nm_S1), frLJ_S1);
1018 #            ifndef HALF_LJ
1019         frLJ_S2 = fma(c6grid_S2,
1020                       fnma(expmcr2_S2, fma(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2);
1021         frLJ_S3 = fma(c6grid_S3,
1022                       fnma(expmcr2_S3, fma(rinvsix_nm_S3, poly_S3, lje_c6_6_S), rinvsix_nm_S3), frLJ_S3);
1023 #            endif
1024
1025 #            ifdef CALC_ENERGIES
1026 #                ifdef CHECK_EXCLS
1027         sh_mask_S0 = selectByMask(lje_vc_S, interact_S0);
1028         sh_mask_S1 = selectByMask(lje_vc_S, interact_S1);
1029 #                    ifndef HALF_LJ
1030         sh_mask_S2 = selectByMask(lje_vc_S, interact_S2);
1031         sh_mask_S3 = selectByMask(lje_vc_S, interact_S3);
1032 #                    endif
1033 #                else
1034         sh_mask_S0 = lje_vc_S;
1035         sh_mask_S1 = lje_vc_S;
1036 #                    ifndef HALF_LJ
1037         sh_mask_S2 = lje_vc_S;
1038         sh_mask_S3 = lje_vc_S;
1039 #                    endif
1040 #                endif
1041
1042         VLJ_S0 = fma(sixth_S * c6grid_S0,
1043                      fma(rinvsix_nm_S0, fnma(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0);
1044         VLJ_S1 = fma(sixth_S * c6grid_S1,
1045                      fma(rinvsix_nm_S1, fnma(expmcr2_S1, poly_S1, one_S), sh_mask_S1), VLJ_S1);
1046 #                ifndef HALF_LJ
1047         VLJ_S2 = fma(sixth_S * c6grid_S2,
1048                      fma(rinvsix_nm_S2, fnma(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2);
1049         VLJ_S3 = fma(sixth_S * c6grid_S3,
1050                      fma(rinvsix_nm_S3, fnma(expmcr2_S3, poly_S3, one_S), sh_mask_S3), VLJ_S3);
1051 #                endif
1052 #            endif /* CALC_ENERGIES */
1053     }
1054 #        endif /* LJ_EWALD_GEOM */
1055
1056 #        if defined VDW_CUTOFF_CHECK
1057     /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
1058      * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
1059      */
1060     frLJ_S0 = selectByMask(frLJ_S0, wco_vdw_S0);
1061     frLJ_S1 = selectByMask(frLJ_S1, wco_vdw_S1);
1062 #            ifndef HALF_LJ
1063     frLJ_S2 = selectByMask(frLJ_S2, wco_vdw_S2);
1064     frLJ_S3 = selectByMask(frLJ_S3, wco_vdw_S3);
1065 #            endif
1066 #        endif
1067
1068 #        ifdef CALC_ENERGIES
1069     /* The potential shift should be removed for pairs beyond cut-off */
1070     VLJ_S0 = selectByMask(VLJ_S0, wco_vdw_S0);
1071     VLJ_S1 = selectByMask(VLJ_S1, wco_vdw_S1);
1072 #            ifndef HALF_LJ
1073     VLJ_S2 = selectByMask(VLJ_S2, wco_vdw_S2);
1074     VLJ_S3 = selectByMask(VLJ_S3, wco_vdw_S3);
1075 #            endif
1076 #        endif
1077
1078 #    endif /* CALC_LJ */
1079
1080 #    ifdef CALC_ENERGIES
1081 #        ifdef ENERGY_GROUPS
1082     /* Extract the group pair index per j pair.
1083      * Energy groups are stored per i-cluster, so things get
1084      * complicated when the i- and j-cluster size don't match.
1085      */
1086     {
1087         int egps_j;
1088 #            if UNROLLJ == 2
1089         egps_j    = nbatParams.energrp[cj >> 1];
1090         egp_jj[0] = ((egps_j >> ((cj & 1) * egps_jshift)) & egps_jmask) * egps_jstride;
1091 #            else
1092         /* We assume UNROLLI <= UNROLLJ */
1093         int jdi;
1094         for (jdi = 0; jdi < UNROLLJ / UNROLLI; jdi++)
1095         {
1096             int jj;
1097             egps_j = nbatParams.energrp[cj * (UNROLLJ / UNROLLI) + jdi];
1098             for (jj = 0; jj < (UNROLLI / 2); jj++)
1099             {
1100                 egp_jj[jdi * (UNROLLI / 2) + jj] =
1101                         ((egps_j >> (jj * egps_jshift)) & egps_jmask) * egps_jstride;
1102             }
1103         }
1104 #            endif
1105     }
1106 #        endif
1107
1108 #        ifdef CALC_COULOMB
1109 #            ifndef ENERGY_GROUPS
1110     vctot_S = vctot_S + vcoul_S0 + vcoul_S1 + vcoul_S2 + vcoul_S3;
1111 #            else
1112     add_ener_grp(vcoul_S0, vctp[0], egp_jj);
1113     add_ener_grp(vcoul_S1, vctp[1], egp_jj);
1114     add_ener_grp(vcoul_S2, vctp[2], egp_jj);
1115     add_ener_grp(vcoul_S3, vctp[3], egp_jj);
1116 #            endif
1117 #        endif
1118
1119 #        ifdef CALC_LJ
1120
1121 #            ifndef ENERGY_GROUPS
1122 #                ifndef HALF_LJ
1123     Vvdwtot_S = Vvdwtot_S + VLJ_S0 + VLJ_S1 + VLJ_S2 + VLJ_S3;
1124 #                else
1125     Vvdwtot_S = Vvdwtot_S + VLJ_S0 + VLJ_S1;
1126 #                endif
1127 #            else
1128     add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
1129     add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
1130 #                ifndef HALF_LJ
1131     add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj);
1132     add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj);
1133 #                endif
1134 #            endif
1135 #        endif /* CALC_LJ */
1136 #    endif     /* CALC_ENERGIES */
1137
1138 #    ifdef CALC_LJ
1139 #        ifdef CALC_COULOMB
1140     fscal_S0 = rinvsq_S0 * (frcoul_S0 + frLJ_S0);
1141 #        else
1142     fscal_S0 = rinvsq_S0 * frLJ_S0;
1143 #        endif
1144 #        ifdef CALC_COULOMB
1145     fscal_S1 = rinvsq_S1 * (frcoul_S1 + frLJ_S1);
1146 #        else
1147     fscal_S1 = rinvsq_S1 * frLJ_S1;
1148 #        endif
1149 #    else
1150     fscal_S0 = rinvsq_S0 * frcoul_S0;
1151     fscal_S1 = rinvsq_S1 * frcoul_S1;
1152 #    endif /* CALC_LJ */
1153 #    if defined CALC_LJ && !defined HALF_LJ
1154 #        ifdef CALC_COULOMB
1155     fscal_S2 = rinvsq_S2 * (frcoul_S2 + frLJ_S2);
1156     fscal_S3 = rinvsq_S3 * (frcoul_S3 + frLJ_S3);
1157 #        else
1158     fscal_S2 = rinvsq_S2 * frLJ_S2;
1159     fscal_S3 = rinvsq_S3 * frLJ_S3;
1160 #        endif
1161 #    else
1162     /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
1163     fscal_S2 = rinvsq_S2 * frcoul_S2;
1164     fscal_S3 = rinvsq_S3 * frcoul_S3;
1165 #    endif
1166
1167     /* Calculate temporary vectorial force */
1168     tx_S0 = fscal_S0 * dx_S0;
1169     tx_S1 = fscal_S1 * dx_S1;
1170     tx_S2 = fscal_S2 * dx_S2;
1171     tx_S3 = fscal_S3 * dx_S3;
1172     ty_S0 = fscal_S0 * dy_S0;
1173     ty_S1 = fscal_S1 * dy_S1;
1174     ty_S2 = fscal_S2 * dy_S2;
1175     ty_S3 = fscal_S3 * dy_S3;
1176     tz_S0 = fscal_S0 * dz_S0;
1177     tz_S1 = fscal_S1 * dz_S1;
1178     tz_S2 = fscal_S2 * dz_S2;
1179     tz_S3 = fscal_S3 * dz_S3;
1180
1181     /* Increment i atom force */
1182     fix_S0 = fix_S0 + tx_S0;
1183     fix_S1 = fix_S1 + tx_S1;
1184     fix_S2 = fix_S2 + tx_S2;
1185     fix_S3 = fix_S3 + tx_S3;
1186     fiy_S0 = fiy_S0 + ty_S0;
1187     fiy_S1 = fiy_S1 + ty_S1;
1188     fiy_S2 = fiy_S2 + ty_S2;
1189     fiy_S3 = fiy_S3 + ty_S3;
1190     fiz_S0 = fiz_S0 + tz_S0;
1191     fiz_S1 = fiz_S1 + tz_S1;
1192     fiz_S2 = fiz_S2 + tz_S2;
1193     fiz_S3 = fiz_S3 + tz_S3;
1194
1195     /* Decrement j atom force */
1196     store(f + ajx, load<SimdReal>(f + ajx) - (tx_S0 + tx_S1 + tx_S2 + tx_S3));
1197     store(f + ajy, load<SimdReal>(f + ajy) - (ty_S0 + ty_S1 + ty_S2 + ty_S3));
1198     store(f + ajz, load<SimdReal>(f + ajz) - (tz_S0 + tz_S1 + tz_S2 + tz_S3));
1199 }
1200
1201 #    undef rinv_ex_S0
1202 #    undef rinv_ex_S1
1203 #    undef rinv_ex_S2
1204 #    undef rinv_ex_S3
1205
1206 #    undef wco_vdw_S0
1207 #    undef wco_vdw_S1
1208 #    undef wco_vdw_S2
1209 #    undef wco_vdw_S3
1210
1211 #    undef EXCL_FORCES
1212
1213 #endif // !DOXYGEN