src/gromacs/nbnxm/kernels_simd_4xm/kernel_inner.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /* Doxygen gets confused (buggy) about the block in this file in combination with
  37  * the  namespace prefix, and thinks store is documented here.
  38  * This will solve itself with the second-generation nbnxn kernels, so for now
  39  * we just tell Doxygen to stay out.
  40  */
  41 #ifndef DOXYGEN
  42
  43 /* This is the innermost loop contents for the 4 x N atom simd kernel.
  44  * This flavor of the kernel calculates interactions of 4 i-atoms
  45  * with N j-atoms stored in N wide simd registers.
  46  */
  47
  48 /* When calculating RF or Ewald interactions we calculate the electrostatic/LJ
  49  * forces on excluded atom pairs here in the non-bonded loops.
  50  * But when energies and/or virial is required we calculate them
  51  * separately to as then it is easier to separate the energy and virial
  52  * contributions.
  53  */
  54 #if defined CHECK_EXCLS && (defined CALC_COULOMB || defined LJ_EWALD_GEOM)
  55 #define EXCL_FORCES
  56 #endif
  57
  58 {
  59     int            cj, ajx, ajy, ajz;
  60     int gmx_unused aj;
  61
  62 #ifdef ENERGY_GROUPS
  63     /* Energy group indices for two atoms packed into one int */
  64     int        egp_jj[UNROLLJ/2];
  65 #endif
  66
  67 #ifdef CHECK_EXCLS
  68     /* Interaction (non-exclusion) mask of all 1's or 0's */
  69     SimdBool  interact_S0;
  70     SimdBool  interact_S1;
  71     SimdBool  interact_S2;
  72     SimdBool  interact_S3;
  73 #endif
  74
  75     SimdReal  jx_S, jy_S, jz_S;
  76     SimdReal  dx_S0, dy_S0, dz_S0;
  77     SimdReal  dx_S1, dy_S1, dz_S1;
  78     SimdReal  dx_S2, dy_S2, dz_S2;
  79     SimdReal  dx_S3, dy_S3, dz_S3;
  80     SimdReal  tx_S0, ty_S0, tz_S0;
  81     SimdReal  tx_S1, ty_S1, tz_S1;
  82     SimdReal  tx_S2, ty_S2, tz_S2;
  83     SimdReal  tx_S3, ty_S3, tz_S3;
  84     SimdReal  rsq_S0, rinv_S0, rinvsq_S0;
  85     SimdReal  rsq_S1, rinv_S1, rinvsq_S1;
  86     SimdReal  rsq_S2, rinv_S2, rinvsq_S2;
  87     SimdReal  rsq_S3, rinv_S3, rinvsq_S3;
  88
  89     /* wco: within cut-off, mask of all 1's or 0's */
  90     SimdBool  wco_S0;
  91     SimdBool  wco_S1;
  92     SimdBool  wco_S2;
  93     SimdBool  wco_S3;
  94
  95 #ifdef VDW_CUTOFF_CHECK
  96     SimdBool  wco_vdw_S0;
  97     SimdBool  wco_vdw_S1;
  98 #ifndef HALF_LJ
  99     SimdBool  wco_vdw_S2;
 100     SimdBool  wco_vdw_S3;
 101 #endif
 102 #endif
 103
 104 #if (defined CALC_COULOMB && defined CALC_COUL_TAB) || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 105     SimdReal r_S0;
 106     SimdReal r_S1;
 107 #if (defined CALC_COULOMB && defined CALC_COUL_TAB) || !defined HALF_LJ
 108     SimdReal r_S2;
 109     SimdReal r_S3;
 110 #endif
 111 #endif
 112
 113 #if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 114     SimdReal  rsw_S0, rsw2_S0;
 115     SimdReal  rsw_S1, rsw2_S1;
 116 #ifndef HALF_LJ
 117     SimdReal  rsw_S2, rsw2_S2;
 118     SimdReal  rsw_S3, rsw2_S3;
 119 #endif
 120 #endif
 121
 122 #ifdef CALC_COULOMB
 123 #ifdef CHECK_EXCLS
 124     /* 1/r masked with the interaction mask */
 125     SimdReal  rinv_ex_S0;
 126     SimdReal  rinv_ex_S1;
 127     SimdReal  rinv_ex_S2;
 128     SimdReal  rinv_ex_S3;
 129 #endif
 130     SimdReal  jq_S;
 131     SimdReal  qq_S0;
 132     SimdReal  qq_S1;
 133     SimdReal  qq_S2;
 134     SimdReal  qq_S3;
 135 #ifdef CALC_COUL_TAB
 136     /* The force (PME mesh force) we need to subtract from 1/r^2 */
 137     SimdReal  fsub_S0;
 138     SimdReal  fsub_S1;
 139     SimdReal  fsub_S2;
 140     SimdReal  fsub_S3;
 141 #endif
 142 #ifdef CALC_COUL_EWALD
 143     SimdReal  brsq_S0, brsq_S1, brsq_S2, brsq_S3;
 144     SimdReal  ewcorr_S0, ewcorr_S1, ewcorr_S2, ewcorr_S3;
 145 #endif
 146
 147     /* frcoul = (1/r - fsub)*r */
 148     SimdReal  frcoul_S0;
 149     SimdReal  frcoul_S1;
 150     SimdReal  frcoul_S2;
 151     SimdReal  frcoul_S3;
 152 #ifdef CALC_COUL_TAB
 153     /* For tables: r, rs=r/sp, rf=floor(rs), frac=rs-rf */
 154     SimdReal         rs_S0, rf_S0, frac_S0;
 155     SimdReal         rs_S1, rf_S1, frac_S1;
 156     SimdReal         rs_S2, rf_S2, frac_S2;
 157     SimdReal         rs_S3, rf_S3, frac_S3;
 158     /* Table index: rs truncated to an int */
 159     SimdInt32        ti_S0, ti_S1, ti_S2, ti_S3;
 160     /* Linear force table values */
 161     SimdReal         ctab0_S0, ctab1_S0;
 162     SimdReal         ctab0_S1, ctab1_S1;
 163     SimdReal         ctab0_S2, ctab1_S2;
 164     SimdReal         ctab0_S3, ctab1_S3;
 165 #ifdef CALC_ENERGIES
 166     /* Quadratic energy table value */
 167     SimdReal  ctabv_S0, dum_S0;
 168     SimdReal  ctabv_S1, dum_S1;
 169     SimdReal  ctabv_S2, dum_S2;
 170     SimdReal  ctabv_S3, dum_S3;
 171 #endif
 172 #endif
 173 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 174     /* The potential (PME mesh) we need to subtract from 1/r */
 175     SimdReal  vc_sub_S0;
 176     SimdReal  vc_sub_S1;
 177     SimdReal  vc_sub_S2;
 178     SimdReal  vc_sub_S3;
 179 #endif
 180 #ifdef CALC_ENERGIES
 181     /* Electrostatic potential */
 182     SimdReal  vcoul_S0;
 183     SimdReal  vcoul_S1;
 184     SimdReal  vcoul_S2;
 185     SimdReal  vcoul_S3;
 186 #endif
 187 #endif
 188     /* The force times 1/r */
 189     SimdReal  fscal_S0;
 190     SimdReal  fscal_S1;
 191     SimdReal  fscal_S2;
 192     SimdReal  fscal_S3;
 193
 194 #ifdef CALC_LJ
 195 #ifdef LJ_COMB_LB
 196     /* LJ sigma_j/2 and sqrt(epsilon_j) */
 197     SimdReal  hsig_j_S, seps_j_S;
 198     /* LJ sigma_ij and epsilon_ij */
 199     SimdReal  sig_S0, eps_S0;
 200     SimdReal  sig_S1, eps_S1;
 201 #ifndef HALF_LJ
 202     SimdReal  sig_S2, eps_S2;
 203     SimdReal  sig_S3, eps_S3;
 204 #endif
 205 #ifdef CALC_ENERGIES
 206     SimdReal  sig2_S0, sig6_S0;
 207     SimdReal  sig2_S1, sig6_S1;
 208 #ifndef HALF_LJ
 209     SimdReal  sig2_S2, sig6_S2;
 210     SimdReal  sig2_S3, sig6_S3;
 211 #endif
 212 #endif /* LJ_COMB_LB */
 213 #endif /* CALC_LJ */
 214
 215 #ifdef LJ_COMB_GEOM
 216     SimdReal  c6s_j_S, c12s_j_S;
 217 #endif
 218
 219 #if defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM
 220     /* Index for loading LJ parameters, complicated when interleaving */
 221     int         aj2;
 222 #endif
 223
 224     /* Intermediate variables for LJ calculation */
 225 #ifndef LJ_COMB_LB
 226     SimdReal  rinvsix_S0;
 227     SimdReal  rinvsix_S1;
 228 #ifndef HALF_LJ
 229     SimdReal  rinvsix_S2;
 230     SimdReal  rinvsix_S3;
 231 #endif
 232 #endif
 233 #ifdef LJ_COMB_LB
 234     SimdReal  sir_S0, sir2_S0, sir6_S0;
 235     SimdReal  sir_S1, sir2_S1, sir6_S1;
 236 #ifndef HALF_LJ
 237     SimdReal  sir_S2, sir2_S2, sir6_S2;
 238     SimdReal  sir_S3, sir2_S3, sir6_S3;
 239 #endif
 240 #endif
 241
 242     SimdReal  FrLJ6_S0, FrLJ12_S0, frLJ_S0;
 243     SimdReal  FrLJ6_S1, FrLJ12_S1, frLJ_S1;
 244 #ifndef HALF_LJ
 245     SimdReal  FrLJ6_S2, FrLJ12_S2, frLJ_S2;
 246     SimdReal  FrLJ6_S3, FrLJ12_S3, frLJ_S3;
 247 #endif
 248 #endif /* CALC_LJ */
 249
 250     /* j-cluster index */
 251     cj            = l_cj[cjind].cj;
 252
 253     /* Atom indices (of the first atom in the cluster) */
 254     aj            = cj*UNROLLJ;
 255 #if defined CALC_LJ && (defined LJ_COMB_GEOM || defined LJ_COMB_LB || defined LJ_EWALD_GEOM)
 256 #if UNROLLJ == STRIDE
 257     aj2           = aj*2;
 258 #else
 259     aj2           = (cj>>1)*2*STRIDE + (cj & 1)*UNROLLJ;
 260 #endif
 261 #endif
 262 #if UNROLLJ == STRIDE
 263     ajx           = aj*DIM;
 264 #else
 265     ajx           = (cj>>1)*DIM*STRIDE + (cj & 1)*UNROLLJ;
 266 #endif
 267     ajy           = ajx + STRIDE;
 268     ajz           = ajy + STRIDE;
 269
 270 #ifdef CHECK_EXCLS
 271     gmx_load_simd_4xn_interactions(l_cj[cjind].excl,
 272                                    filter_S0, filter_S1,
 273                                    filter_S2, filter_S3,
 274                                    nbat->simdMasks.interaction_array.data(),
 275                                    &interact_S0, &interact_S1,
 276                                    &interact_S2, &interact_S3);
 277 #endif /* CHECK_EXCLS */
 278
 279     /* load j atom coordinates */
 280     jx_S        = load<SimdReal>(x+ajx);
 281     jy_S        = load<SimdReal>(x+ajy);
 282     jz_S        = load<SimdReal>(x+ajz);
 283
 284     /* Calculate distance */
 285     dx_S0       = ix_S0 - jx_S;
 286     dy_S0       = iy_S0 - jy_S;
 287     dz_S0       = iz_S0 - jz_S;
 288     dx_S1       = ix_S1 - jx_S;
 289     dy_S1       = iy_S1 - jy_S;
 290     dz_S1       = iz_S1 - jz_S;
 291     dx_S2       = ix_S2 - jx_S;
 292     dy_S2       = iy_S2 - jy_S;
 293     dz_S2       = iz_S2 - jz_S;
 294     dx_S3       = ix_S3 - jx_S;
 295     dy_S3       = iy_S3 - jy_S;
 296     dz_S3       = iz_S3 - jz_S;
 297
 298     /* rsq = dx*dx+dy*dy+dz*dz */
 299     rsq_S0      = norm2(dx_S0, dy_S0, dz_S0);
 300     rsq_S1      = norm2(dx_S1, dy_S1, dz_S1);
 301     rsq_S2      = norm2(dx_S2, dy_S2, dz_S2);
 302     rsq_S3      = norm2(dx_S3, dy_S3, dz_S3);
 303
 304     /* Do the cut-off check */
 305     wco_S0      = (rsq_S0 < rc2_S);
 306     wco_S1      = (rsq_S1 < rc2_S);
 307     wco_S2      = (rsq_S2 < rc2_S);
 308     wco_S3      = (rsq_S3 < rc2_S);
 309
 310 #ifdef CHECK_EXCLS
 311 #ifdef EXCL_FORCES
 312     /* Only remove the (sub-)diagonal to avoid double counting */
 313 #if UNROLLJ == UNROLLI
 314     if (cj == ci_sh)
 315     {
 316         wco_S0  = wco_S0 && diagonal_mask_S0;
 317         wco_S1  = wco_S1 && diagonal_mask_S1;
 318         wco_S2  = wco_S2 && diagonal_mask_S2;
 319         wco_S3  = wco_S3 && diagonal_mask_S3;
 320     }
 321 #else
 322 #if UNROLLJ < UNROLLI
 323     if (cj == ci_sh*2)
 324     {
 325         wco_S0  = wco_S0 && diagonal_mask0_S0;
 326         wco_S1  = wco_S1 && diagonal_mask0_S1;
 327         wco_S2  = wco_S2 && diagonal_mask0_S2;
 328         wco_S3  = wco_S3 && diagonal_mask0_S3;
 329     }
 330     if (cj == ci_sh*2 + 1)
 331     {
 332         wco_S0  = wco_S0 && diagonal_mask1_S0;
 333         wco_S1  = wco_S1 && diagonal_mask1_S1;
 334         wco_S2  = wco_S2 && diagonal_mask1_S2;
 335         wco_S3  = wco_S3 && diagonal_mask1_S3;
 336     }
 337 #else
 338     if (cj*2 == ci_sh)
 339     {
 340         wco_S0  = wco_S0 && diagonal_mask0_S0;
 341         wco_S1  = wco_S1 && diagonal_mask0_S1;
 342         wco_S2  = wco_S2 && diagonal_mask0_S2;
 343         wco_S3  = wco_S3 && diagonal_mask0_S3;
 344     }
 345     else if (cj*2 + 1 == ci_sh)
 346     {
 347         wco_S0  = wco_S0 && diagonal_mask1_S0;
 348         wco_S1  = wco_S1 && diagonal_mask1_S1;
 349         wco_S2  = wco_S2 && diagonal_mask1_S2;
 350         wco_S3  = wco_S3 && diagonal_mask1_S3;
 351     }
 352 #endif
 353 #endif
 354 #else /* EXCL_FORCES */
 355       /* No exclusion forces: remove all excluded atom pairs from the list */
 356     wco_S0      = wco_S0 && interact_S0;
 357     wco_S1      = wco_S1 && interact_S1;
 358     wco_S2      = wco_S2 && interact_S2;
 359     wco_S3      = wco_S3 && interact_S3;
 360 #endif
 361 #endif
 362
 363 #ifdef COUNT_PAIRS
 364     {
 365         int  i, j;
 366         alignas(GMX_SIMD_ALIGNMENT) real  tmp[2*GMX_SIMD_REAL_WIDTH];
 367
 368         for (i = 0; i < UNROLLI; i++)
 369         {
 370             store(tmp, rc2_S - (i == 0 ? rsq_S0 : (i == 1 ? rsq_S1 : (i == 2 ? rsq_S2 : rsq_S3))));
 371             for (j = 0; j < UNROLLJ; j++)
 372             {
 373                 if (tmp[j] >= 0)
 374                 {
 375                     npair++;
 376                 }
 377             }
 378         }
 379     }
 380 #endif
 381
 382     // Ensure the distances do not fall below the limit where r^-12 overflows.
 383     // This should never happen for normal interactions.
 384     rsq_S0      = max(rsq_S0, minRsq_S);
 385     rsq_S1      = max(rsq_S1, minRsq_S);
 386     rsq_S2      = max(rsq_S2, minRsq_S);
 387     rsq_S3      = max(rsq_S3, minRsq_S);
 388
 389     /* Calculate 1/r */
 390 #if !GMX_DOUBLE
 391     rinv_S0     = invsqrt(rsq_S0);
 392     rinv_S1     = invsqrt(rsq_S1);
 393     rinv_S2     = invsqrt(rsq_S2);
 394     rinv_S3     = invsqrt(rsq_S3);
 395 #else
 396     invsqrtPair(rsq_S0, rsq_S1, &rinv_S0, &rinv_S1);
 397     invsqrtPair(rsq_S2, rsq_S3, &rinv_S2, &rinv_S3);
 398 #endif
 399
 400 #ifdef CALC_COULOMB
 401     /* Load parameters for j atom */
 402     jq_S        = load<SimdReal>(q+aj);
 403     qq_S0       = iq_S0 * jq_S;
 404     qq_S1       = iq_S1 * jq_S;
 405     qq_S2       = iq_S2 * jq_S;
 406     qq_S3       = iq_S3 * jq_S;
 407 #endif
 408
 409 #ifdef CALC_LJ
 410 #if !defined LJ_COMB_GEOM && !defined LJ_COMB_LB && !defined FIX_LJ_C
 411     SimdReal c6_S0, c6_S1, c12_S0, c12_S1;
 412     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp0, type+aj, &c6_S0, &c12_S0);
 413     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp1, type+aj, &c6_S1, &c12_S1);
 414 #ifndef HALF_LJ
 415     SimdReal c6_S2, c6_S3, c12_S2, c12_S3;
 416     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp2, type+aj, &c6_S2, &c12_S2);
 417     gatherLoadTranspose<c_simdBestPairAlignment>(nbfp3, type+aj, &c6_S3, &c12_S3);
 418 #endif
 419 #endif /* not defined any LJ rule */
 420
 421 #ifdef LJ_COMB_GEOM
 422     c6s_j_S     = load<SimdReal>(ljc+aj2+0);
 423     c12s_j_S    = load<SimdReal>(ljc+aj2+STRIDE);
 424     SimdReal c6_S0  = c6s_S0 * c6s_j_S;
 425     SimdReal c6_S1  = c6s_S1 * c6s_j_S;
 426 #ifndef HALF_LJ
 427     SimdReal c6_S2  = c6s_S2 * c6s_j_S;
 428     SimdReal c6_S3  = c6s_S3 * c6s_j_S;
 429 #endif
 430     SimdReal c12_S0 = c12s_S0 * c12s_j_S;
 431     SimdReal c12_S1 = c12s_S1 * c12s_j_S;
 432 #ifndef HALF_LJ
 433     SimdReal c12_S2 = c12s_S2 * c12s_j_S;
 434     SimdReal c12_S3 = c12s_S3 * c12s_j_S;
 435 #endif
 436 #endif /* LJ_COMB_GEOM */
 437
 438 #ifdef LJ_COMB_LB
 439     hsig_j_S    = load<SimdReal>(ljc+aj2+0);
 440     seps_j_S    = load<SimdReal>(ljc+aj2+STRIDE);
 441
 442     sig_S0      = hsig_i_S0 + hsig_j_S;
 443     sig_S1      = hsig_i_S1 + hsig_j_S;
 444     eps_S0      = seps_i_S0 * seps_j_S;
 445     eps_S1      = seps_i_S1 * seps_j_S;
 446 #ifndef HALF_LJ
 447     sig_S2      = hsig_i_S2 + hsig_j_S;
 448     sig_S3      = hsig_i_S3 + hsig_j_S;
 449     eps_S2      = seps_i_S2 * seps_j_S;
 450     eps_S3      = seps_i_S3 * seps_j_S;
 451 #endif
 452 #endif /* LJ_COMB_LB */
 453
 454 #endif /* CALC_LJ */
 455
 456     /* Set rinv to zero for r beyond the cut-off */
 457     rinv_S0     = selectByMask(rinv_S0, wco_S0);
 458     rinv_S1     = selectByMask(rinv_S1, wco_S1);
 459     rinv_S2     = selectByMask(rinv_S2, wco_S2);
 460     rinv_S3     = selectByMask(rinv_S3, wco_S3);
 461
 462     rinvsq_S0   = rinv_S0 * rinv_S0;
 463     rinvsq_S1   = rinv_S1 * rinv_S1;
 464     rinvsq_S2   = rinv_S2 * rinv_S2;
 465     rinvsq_S3   = rinv_S3 * rinv_S3;
 466
 467 #ifdef CALC_COULOMB
 468     /* Note that here we calculate force*r, not the usual force/r.
 469      * This allows avoiding masking the reaction-field contribution,
 470      * as frcoul is later multiplied by rinvsq which has been
 471      * masked with the cut-off check.
 472      */
 473
 474 #ifdef EXCL_FORCES
 475     /* Only add 1/r for non-excluded atom pairs */
 476     rinv_ex_S0  = selectByMask(rinv_S0, interact_S0);
 477     rinv_ex_S1  = selectByMask(rinv_S1, interact_S1);
 478     rinv_ex_S2  = selectByMask(rinv_S2, interact_S2);
 479     rinv_ex_S3  = selectByMask(rinv_S3, interact_S3);
 480 #else
 481     /* No exclusion forces, we always need 1/r */
 482 #define     rinv_ex_S0    rinv_S0
 483 #define     rinv_ex_S1    rinv_S1
 484 #define     rinv_ex_S2    rinv_S2
 485 #define     rinv_ex_S3    rinv_S3
 486 #endif
 487
 488 #ifdef CALC_COUL_RF
 489     /* Electrostatic interactions */
 490     frcoul_S0   = qq_S0 * fma(rsq_S0, mrc_3_S, rinv_ex_S0);
 491     frcoul_S1   = qq_S1 * fma(rsq_S1, mrc_3_S, rinv_ex_S1);
 492     frcoul_S2   = qq_S2 * fma(rsq_S2, mrc_3_S, rinv_ex_S2);
 493     frcoul_S3   = qq_S3 * fma(rsq_S3, mrc_3_S, rinv_ex_S3);
 494
 495 #ifdef CALC_ENERGIES
 496     vcoul_S0    = qq_S0 * (rinv_ex_S0 + fma(rsq_S0, hrc_3_S, moh_rc_S));
 497     vcoul_S1    = qq_S1 * (rinv_ex_S1 + fma(rsq_S1, hrc_3_S, moh_rc_S));
 498     vcoul_S2    = qq_S2 * (rinv_ex_S2 + fma(rsq_S2, hrc_3_S, moh_rc_S));
 499     vcoul_S3    = qq_S3 * (rinv_ex_S3 + fma(rsq_S3, hrc_3_S, moh_rc_S));
 500 #endif
 501 #endif
 502
 503 #ifdef CALC_COUL_EWALD
 504     /* We need to mask (or limit) rsq for the cut-off,
 505      * as large distances can cause an overflow in gmx_pmecorrF/V.
 506      */
 507     brsq_S0     = beta2_S * selectByMask(rsq_S0, wco_S0);
 508     brsq_S1     = beta2_S * selectByMask(rsq_S1, wco_S1);
 509     brsq_S2     = beta2_S * selectByMask(rsq_S2, wco_S2);
 510     brsq_S3     = beta2_S * selectByMask(rsq_S3, wco_S3);
 511     ewcorr_S0   = beta_S * pmeForceCorrection(brsq_S0);
 512     ewcorr_S1   = beta_S * pmeForceCorrection(brsq_S1);
 513     ewcorr_S2   = beta_S * pmeForceCorrection(brsq_S2);
 514     ewcorr_S3   = beta_S * pmeForceCorrection(brsq_S3);
 515     frcoul_S0   = qq_S0 * fma(ewcorr_S0, brsq_S0, rinv_ex_S0);
 516     frcoul_S1   = qq_S1 * fma(ewcorr_S1, brsq_S1, rinv_ex_S1);
 517     frcoul_S2   = qq_S2 * fma(ewcorr_S2, brsq_S2, rinv_ex_S2);
 518     frcoul_S3   = qq_S3 * fma(ewcorr_S3, brsq_S3, rinv_ex_S3);
 519
 520 #ifdef CALC_ENERGIES
 521     vc_sub_S0   = beta_S * pmePotentialCorrection(brsq_S0);
 522     vc_sub_S1   = beta_S * pmePotentialCorrection(brsq_S1);
 523     vc_sub_S2   = beta_S * pmePotentialCorrection(brsq_S2);
 524     vc_sub_S3   = beta_S * pmePotentialCorrection(brsq_S3);
 525 #endif
 526
 527 #endif /* CALC_COUL_EWALD */
 528
 529 #ifdef CALC_COUL_TAB
 530     /* Electrostatic interactions */
 531     r_S0        = rsq_S0 * rinv_S0;
 532     r_S1        = rsq_S1 * rinv_S1;
 533     r_S2        = rsq_S2 * rinv_S2;
 534     r_S3        = rsq_S3 * rinv_S3;
 535     /* Convert r to scaled table units */
 536     rs_S0       = r_S0 * invtsp_S;
 537     rs_S1       = r_S1 * invtsp_S;
 538     rs_S2       = r_S2 * invtsp_S;
 539     rs_S3       = r_S3 * invtsp_S;
 540     /* Truncate scaled r to an int */
 541     ti_S0       = cvttR2I(rs_S0);
 542     ti_S1       = cvttR2I(rs_S1);
 543     ti_S2       = cvttR2I(rs_S2);
 544     ti_S3       = cvttR2I(rs_S3);
 545
 546     rf_S0       = trunc(rs_S0);
 547     rf_S1       = trunc(rs_S1);
 548     rf_S2       = trunc(rs_S2);
 549     rf_S3       = trunc(rs_S3);
 550
 551     frac_S0     = rs_S0 - rf_S0;
 552     frac_S1     = rs_S1 - rf_S1;
 553     frac_S2     = rs_S2 - rf_S2;
 554     frac_S3     = rs_S3 - rf_S3;
 555
 556     /* Load and interpolate table forces and possibly energies.
 557      * Force and energy can be combined in one table, stride 4: FDV0
 558      * or in two separate tables with stride 1: F and V
 559      * Currently single precision uses FDV0, double F and V.
 560      */
 561 #ifndef CALC_ENERGIES
 562 #ifdef TAB_FDV0
 563     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 564     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
 565     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 566     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
 567 #else
 568     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 569     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
 570     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 571     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
 572     ctab1_S0 = ctab1_S0 - ctab0_S0;
 573     ctab1_S1 = ctab1_S1 - ctab0_S1;
 574     ctab1_S2 = ctab1_S2 - ctab0_S2;
 575     ctab1_S3 = ctab1_S3 - ctab0_S3;
 576 #endif
 577 #else
 578 #ifdef TAB_FDV0
 579     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0, &ctabv_S0, &dum_S0);
 580     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1, &ctabv_S1, &dum_S1);
 581     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2, &ctabv_S2, &dum_S2);
 582     gatherLoadBySimdIntTranspose<4>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3, &ctabv_S3, &dum_S3);
 583 #else
 584     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S0, &ctab0_S0, &ctab1_S0);
 585     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S0, &ctabv_S0, &dum_S0);
 586     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S1, &ctab0_S1, &ctab1_S1);
 587     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S1, &ctabv_S1, &dum_S1);
 588     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S2, &ctab0_S2, &ctab1_S2);
 589     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S2, &ctabv_S2, &dum_S2);
 590     gatherLoadUBySimdIntTranspose<1>(tab_coul_F, ti_S3, &ctab0_S3, &ctab1_S3);
 591     gatherLoadUBySimdIntTranspose<1>(tab_coul_V, ti_S3, &ctabv_S3, &dum_S3);
 592     ctab1_S0 = ctab1_S0 - ctab0_S0;
 593     ctab1_S1 = ctab1_S1 - ctab0_S1;
 594     ctab1_S2 = ctab1_S2 - ctab0_S2;
 595     ctab1_S3 = ctab1_S3 - ctab0_S3;
 596 #endif
 597 #endif
 598     fsub_S0     = fma(frac_S0, ctab1_S0, ctab0_S0);
 599     fsub_S1     = fma(frac_S1, ctab1_S1, ctab0_S1);
 600     fsub_S2     = fma(frac_S2, ctab1_S2, ctab0_S2);
 601     fsub_S3     = fma(frac_S3, ctab1_S3, ctab0_S3);
 602     frcoul_S0   = qq_S0 * fnma(fsub_S0, r_S0, rinv_ex_S0);
 603     frcoul_S1   = qq_S1 * fnma(fsub_S1, r_S1, rinv_ex_S1);
 604     frcoul_S2   = qq_S2 * fnma(fsub_S2, r_S2, rinv_ex_S2);
 605     frcoul_S3   = qq_S3 * fnma(fsub_S3, r_S3, rinv_ex_S3);
 606
 607 #ifdef CALC_ENERGIES
 608     vc_sub_S0   = fma((mhalfsp_S * frac_S0), (ctab0_S0 + fsub_S0), ctabv_S0);
 609     vc_sub_S1   = fma((mhalfsp_S * frac_S1), (ctab0_S1 + fsub_S1), ctabv_S1);
 610     vc_sub_S2   = fma((mhalfsp_S * frac_S2), (ctab0_S2 + fsub_S2), ctabv_S2);
 611     vc_sub_S3   = fma((mhalfsp_S * frac_S3), (ctab0_S3 + fsub_S3), ctabv_S3);
 612 #endif
 613 #endif /* CALC_COUL_TAB */
 614
 615 #if defined CALC_ENERGIES && (defined CALC_COUL_EWALD || defined CALC_COUL_TAB)
 616 #ifndef NO_SHIFT_EWALD
 617     /* Add Ewald potential shift to vc_sub for convenience */
 618 #ifdef CHECK_EXCLS
 619     vc_sub_S0   = vc_sub_S0 + selectByMask(sh_ewald_S, interact_S0);
 620     vc_sub_S1   = vc_sub_S1 + selectByMask(sh_ewald_S, interact_S1);
 621     vc_sub_S2   = vc_sub_S2 + selectByMask(sh_ewald_S, interact_S2);
 622     vc_sub_S3   = vc_sub_S3 + selectByMask(sh_ewald_S, interact_S3);
 623 #else
 624     vc_sub_S0   = vc_sub_S0 + sh_ewald_S;
 625     vc_sub_S1   = vc_sub_S1 + sh_ewald_S;
 626     vc_sub_S2   = vc_sub_S2 + sh_ewald_S;
 627     vc_sub_S3   = vc_sub_S3 + sh_ewald_S;
 628 #endif
 629 #endif
 630
 631     vcoul_S0    = qq_S0 * (rinv_ex_S0 - vc_sub_S0);
 632     vcoul_S1    = qq_S1 * (rinv_ex_S1 - vc_sub_S1);
 633     vcoul_S2    = qq_S2 * (rinv_ex_S2 - vc_sub_S2);
 634     vcoul_S3    = qq_S3 * (rinv_ex_S3 - vc_sub_S3);
 635
 636 #endif
 637
 638 #ifdef CALC_ENERGIES
 639     /* Mask energy for cut-off and diagonal */
 640     vcoul_S0    = selectByMask(vcoul_S0, wco_S0);
 641     vcoul_S1    = selectByMask(vcoul_S1, wco_S1);
 642     vcoul_S2    = selectByMask(vcoul_S2, wco_S2);
 643     vcoul_S3    = selectByMask(vcoul_S3, wco_S3);
 644 #endif
 645
 646 #endif /* CALC_COULOMB */
 647
 648 #ifdef CALC_LJ
 649     /* Lennard-Jones interaction */
 650
 651 #ifdef VDW_CUTOFF_CHECK
 652     wco_vdw_S0  = (rsq_S0 < rcvdw2_S);
 653     wco_vdw_S1  = (rsq_S1 < rcvdw2_S);
 654 #ifndef HALF_LJ
 655     wco_vdw_S2  = (rsq_S2 < rcvdw2_S);
 656     wco_vdw_S3  = (rsq_S3 < rcvdw2_S);
 657 #endif
 658 #else
 659     /* Same cut-off for Coulomb and VdW, reuse the registers */
 660 #define     wco_vdw_S0    wco_S0
 661 #define     wco_vdw_S1    wco_S1
 662 #define     wco_vdw_S2    wco_S2
 663 #define     wco_vdw_S3    wco_S3
 664 #endif
 665
 666 #ifndef LJ_COMB_LB
 667     rinvsix_S0  = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
 668     rinvsix_S1  = rinvsq_S1 * rinvsq_S1 * rinvsq_S1;
 669 #ifdef EXCL_FORCES
 670     rinvsix_S0  = selectByMask(rinvsix_S0, interact_S0);
 671     rinvsix_S1  = selectByMask(rinvsix_S1, interact_S1);
 672 #endif
 673 #ifndef HALF_LJ
 674     rinvsix_S2  = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
 675     rinvsix_S3  = rinvsq_S3 * rinvsq_S3 * rinvsq_S3;
 676 #ifdef EXCL_FORCES
 677     rinvsix_S2  = selectByMask(rinvsix_S2, interact_S2);
 678     rinvsix_S3  = selectByMask(rinvsix_S3, interact_S3);
 679 #endif
 680 #endif
 681
 682 #if defined LJ_CUT || defined LJ_POT_SWITCH
 683     /* We have plain LJ or LJ-PME with simple C6/6 C12/12 coefficients */
 684     FrLJ6_S0    = c6_S0 * rinvsix_S0;
 685     FrLJ6_S1    = c6_S1 * rinvsix_S1;
 686 #ifndef HALF_LJ
 687     FrLJ6_S2    = c6_S2 * rinvsix_S2;
 688     FrLJ6_S3    = c6_S3 * rinvsix_S3;
 689 #endif
 690     FrLJ12_S0   = c12_S0 * rinvsix_S0 * rinvsix_S0;
 691     FrLJ12_S1   = c12_S1 * rinvsix_S1 * rinvsix_S1;
 692 #ifndef HALF_LJ
 693     FrLJ12_S2   = c12_S2 * rinvsix_S2 * rinvsix_S2;
 694     FrLJ12_S3   = c12_S3 * rinvsix_S3 * rinvsix_S3;
 695 #endif
 696 #endif
 697
 698 #if defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
 699     /* We switch the LJ force */
 700     r_S0        = rsq_S0 * rinv_S0;
 701     rsw_S0      = max(r_S0 - rswitch_S, zero_S);
 702     rsw2_S0     = rsw_S0 * rsw_S0;
 703     r_S1        = rsq_S1 * rinv_S1;
 704     rsw_S1      = max(r_S1 - rswitch_S, zero_S);
 705     rsw2_S1     = rsw_S1 * rsw_S1;
 706 #ifndef HALF_LJ
 707     r_S2        = rsq_S2 * rinv_S2;
 708     rsw_S2      = max(r_S2 - rswitch_S, zero_S);
 709     rsw2_S2     = rsw_S2 * rsw_S2;
 710     r_S3        = rsq_S3 * rinv_S3;
 711     rsw_S3      = max(r_S3 - rswitch_S, zero_S);
 712     rsw2_S3     = rsw_S3 * rsw_S3;
 713 #endif
 714 #endif
 715
 716 #ifdef LJ_FORCE_SWITCH
 717
 718 #define gmx_add_fr_switch(fr, rsw, rsw2_r, c2, c3) fma(fma(c3, rsw, c2), rsw2_r, (fr))
 719     SimdReal rsw2_r_S0 = rsw2_S0 * r_S0;
 720     SimdReal rsw2_r_S1 = rsw2_S1 * r_S1;
 721     FrLJ6_S0    = c6_S0 * gmx_add_fr_switch(rinvsix_S0, rsw_S0, rsw2_r_S0, p6_fc2_S, p6_fc3_S);
 722     FrLJ6_S1    = c6_S1 * gmx_add_fr_switch(rinvsix_S1, rsw_S1, rsw2_r_S1, p6_fc2_S, p6_fc3_S);
 723 #ifndef HALF_LJ
 724     SimdReal rsw2_r_S2 = rsw2_S2 * r_S2;
 725     SimdReal rsw2_r_S3 = rsw2_S3 * r_S3;
 726     FrLJ6_S2    = c6_S2 * gmx_add_fr_switch(rinvsix_S2, rsw_S2, rsw2_r_S2, p6_fc2_S, p6_fc3_S);
 727     FrLJ6_S3    = c6_S3 * gmx_add_fr_switch(rinvsix_S3, rsw_S3, rsw2_r_S3, p6_fc2_S, p6_fc3_S);
 728 #endif
 729     FrLJ12_S0   = c12_S0 * gmx_add_fr_switch((rinvsix_S0 * rinvsix_S0), rsw_S0, rsw2_r_S0, p12_fc2_S, p12_fc3_S);
 730     FrLJ12_S1   = c12_S1 * gmx_add_fr_switch((rinvsix_S1 * rinvsix_S1), rsw_S1, rsw2_r_S1, p12_fc2_S, p12_fc3_S);
 731 #ifndef HALF_LJ
 732     FrLJ12_S2   = c12_S2 * gmx_add_fr_switch((rinvsix_S2 * rinvsix_S2), rsw_S2, rsw2_r_S2, p12_fc2_S, p12_fc3_S);
 733     FrLJ12_S3   = c12_S3 * gmx_add_fr_switch((rinvsix_S3 * rinvsix_S3), rsw_S3, rsw2_r_S3, p12_fc2_S, p12_fc3_S);
 734 #endif
 735 #undef gmx_add_fr_switch
 736 #endif /* LJ_FORCE_SWITCH */
 737
 738 #endif /* not LJ_COMB_LB */
 739
 740 #ifdef LJ_COMB_LB
 741     sir_S0      = sig_S0 * rinv_S0;
 742     sir_S1      = sig_S1 * rinv_S1;
 743 #ifndef HALF_LJ
 744     sir_S2      = sig_S2 * rinv_S2;
 745     sir_S3      = sig_S3 * rinv_S3;
 746 #endif
 747     sir2_S0     = sir_S0 * sir_S0;
 748     sir2_S1     = sir_S1 * sir_S1;
 749 #ifndef HALF_LJ
 750     sir2_S2     = sir_S2 * sir_S2;
 751     sir2_S3     = sir_S3 * sir_S3;
 752 #endif
 753     sir6_S0     = sir2_S0 * sir2_S0 * sir2_S0;
 754     sir6_S1     = sir2_S1 * sir2_S1 * sir2_S1;
 755 #ifdef EXCL_FORCES
 756     sir6_S0     = selectByMask(sir6_S0, interact_S0);
 757     sir6_S1     = selectByMask(sir6_S1, interact_S1);
 758 #endif
 759 #ifndef HALF_LJ
 760     sir6_S2     = sir2_S2 * sir2_S2 * sir2_S2;
 761     sir6_S3     = sir2_S3 * sir2_S3 * sir2_S3;
 762 #ifdef EXCL_FORCES
 763     sir6_S2     = selectByMask(sir6_S2, interact_S2);
 764     sir6_S3     = selectByMask(sir6_S3, interact_S3);
 765 #endif
 766 #endif
 767 #ifdef VDW_CUTOFF_CHECK
 768     sir6_S0     = selectByMask(sir6_S0, wco_vdw_S0);
 769     sir6_S1     = selectByMask(sir6_S1, wco_vdw_S1);
 770 #ifndef HALF_LJ
 771     sir6_S2     = selectByMask(sir6_S2, wco_vdw_S2);
 772     sir6_S3     = selectByMask(sir6_S3, wco_vdw_S3);
 773 #endif
 774 #endif
 775     FrLJ6_S0    = eps_S0 * sir6_S0;
 776     FrLJ6_S1    = eps_S1 * sir6_S1;
 777 #ifndef HALF_LJ
 778     FrLJ6_S2    = eps_S2 * sir6_S2;
 779     FrLJ6_S3    = eps_S3 * sir6_S3;
 780 #endif
 781     FrLJ12_S0   = FrLJ6_S0 * sir6_S0;
 782     FrLJ12_S1   = FrLJ6_S1 * sir6_S1;
 783 #ifndef HALF_LJ
 784     FrLJ12_S2   = FrLJ6_S2 * sir6_S2;
 785     FrLJ12_S3   = FrLJ6_S3 * sir6_S3;
 786 #endif
 787 #if defined CALC_ENERGIES
 788     /* We need C6 and C12 to calculate the LJ potential shift */
 789     sig2_S0     = sig_S0 * sig_S0;
 790     sig2_S1     = sig_S1 * sig_S1;
 791 #ifndef HALF_LJ
 792     sig2_S2     = sig_S2 * sig_S2;
 793     sig2_S3     = sig_S3 * sig_S3;
 794 #endif
 795     sig6_S0     = sig2_S0 * sig2_S0 * sig2_S0;
 796     sig6_S1     = sig2_S1 * sig2_S1 * sig2_S1;
 797 #ifndef HALF_LJ
 798     sig6_S2     = sig2_S2 * sig2_S2 * sig2_S2;
 799     sig6_S3     = sig2_S3 * sig2_S3 * sig2_S3;
 800 #endif
 801     SimdReal c6_S0  = eps_S0 * sig6_S0;
 802     SimdReal c6_S1  = eps_S1 * sig6_S1;
 803 #ifndef HALF_LJ
 804     SimdReal c6_S2  = eps_S2 * sig6_S2;
 805     SimdReal c6_S3  = eps_S3 * sig6_S3;
 806 #endif
 807     SimdReal c12_S0 = c6_S0 * sig6_S0;
 808     SimdReal c12_S1 = c6_S1 * sig6_S1;
 809 #ifndef HALF_LJ
 810     SimdReal c12_S2 = c6_S2 * sig6_S2;
 811     SimdReal c12_S3 = c6_S3 * sig6_S3;
 812 #endif
 813 #endif
 814 #endif /* LJ_COMB_LB */
 815
 816     /* Determine the total scalar LJ force*r */
 817     frLJ_S0     = FrLJ12_S0 - FrLJ6_S0;
 818     frLJ_S1     = FrLJ12_S1 - FrLJ6_S1;
 819 #ifndef HALF_LJ
 820     frLJ_S2     = FrLJ12_S2 - FrLJ6_S2;
 821     frLJ_S3     = FrLJ12_S3 - FrLJ6_S3;
 822 #endif
 823
 824 #if (defined LJ_CUT || defined LJ_FORCE_SWITCH) && defined CALC_ENERGIES
 825
 826 #ifdef LJ_CUT
 827     /* Calculate the LJ energies, with constant potential shift */
 828     SimdReal VLJ6_S0  = sixth_S * fma(c6_S0, p6_cpot_S, FrLJ6_S0);
 829     SimdReal VLJ6_S1  = sixth_S * fma(c6_S1, p6_cpot_S, FrLJ6_S1);
 830 #ifndef HALF_LJ
 831     SimdReal VLJ6_S2  = sixth_S * fma(c6_S2, p6_cpot_S, FrLJ6_S2);
 832     SimdReal VLJ6_S3  = sixth_S * fma(c6_S3, p6_cpot_S, FrLJ6_S3);
 833 #endif
 834     SimdReal VLJ12_S0 = twelveth_S * fma(c12_S0, p12_cpot_S, FrLJ12_S0);
 835     SimdReal VLJ12_S1 = twelveth_S * fma(c12_S1, p12_cpot_S, FrLJ12_S1);
 836 #ifndef HALF_LJ
 837     SimdReal VLJ12_S2 = twelveth_S * fma(c12_S2, p12_cpot_S, FrLJ12_S2);
 838     SimdReal VLJ12_S3 = twelveth_S * fma(c12_S3, p12_cpot_S, FrLJ12_S3);
 839 #endif
 840 #endif /* LJ_CUT */
 841
 842 #ifdef LJ_FORCE_SWITCH
 843 #define v_fswitch_r(rsw, rsw2, c0, c3, c4) fma(fma((c4), (rsw), (c3)), ((rsw2) * (rsw)), (c0))
 844
 845     SimdReal VLJ6_S0  = c6_S0 * fma(sixth_S, rinvsix_S0, v_fswitch_r(rsw_S0, rsw2_S0, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 846     SimdReal VLJ6_S1  = c6_S1 * fma(sixth_S, rinvsix_S1, v_fswitch_r(rsw_S1, rsw2_S1, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 847 #ifndef HALF_LJ
 848     SimdReal VLJ6_S2  = c6_S2 * fma(sixth_S, rinvsix_S2, v_fswitch_r(rsw_S2, rsw2_S2, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 849     SimdReal VLJ6_S3  = c6_S3 * fma(sixth_S, rinvsix_S3, v_fswitch_r(rsw_S3, rsw2_S3, p6_6cpot_S, p6_vc3_S, p6_vc4_S));
 850 #endif
 851     SimdReal VLJ12_S0 = c12_S0 * fma(twelveth_S, rinvsix_S0 * rinvsix_S0, v_fswitch_r(rsw_S0, rsw2_S0, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 852     SimdReal VLJ12_S1 = c12_S1 * fma(twelveth_S, rinvsix_S1 * rinvsix_S1, v_fswitch_r(rsw_S1, rsw2_S1, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 853 #ifndef HALF_LJ
 854     SimdReal VLJ12_S2 = c12_S2 * fma(twelveth_S, rinvsix_S2 * rinvsix_S2, v_fswitch_r(rsw_S2, rsw2_S2, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 855     SimdReal VLJ12_S3 = c12_S3 * fma(twelveth_S, rinvsix_S3 * rinvsix_S3, v_fswitch_r(rsw_S3, rsw2_S3, p12_12cpot_S, p12_vc3_S, p12_vc4_S));
 856 #endif
 857 #undef v_fswitch_r
 858 #endif /* LJ_FORCE_SWITCH */
 859
 860     /* Add up the repulsion and dispersion */
 861     SimdReal VLJ_S0 = VLJ12_S0 - VLJ6_S0;
 862     SimdReal VLJ_S1 = VLJ12_S1 - VLJ6_S1;
 863 #ifndef HALF_LJ
 864     SimdReal VLJ_S2 = VLJ12_S2 - VLJ6_S2;
 865     SimdReal VLJ_S3 = VLJ12_S3 - VLJ6_S3;
 866 #endif
 867
 868 #endif /* (LJ_CUT || LJ_FORCE_SWITCH) && CALC_ENERGIES */
 869
 870 #ifdef LJ_POT_SWITCH
 871     /* We always need the potential, since it is needed for the force */
 872     SimdReal VLJ_S0 = fnma(sixth_S, FrLJ6_S0, twelveth_S * FrLJ12_S0);
 873     SimdReal VLJ_S1 = fnma(sixth_S, FrLJ6_S1, twelveth_S * FrLJ12_S1);
 874 #ifndef HALF_LJ
 875     SimdReal VLJ_S2 = fnma(sixth_S, FrLJ6_S2, twelveth_S * FrLJ12_S2);
 876     SimdReal VLJ_S3 = fnma(sixth_S, FrLJ6_S3, twelveth_S * FrLJ12_S3);
 877 #endif
 878
 879     {
 880         SimdReal sw_S0, dsw_S0;
 881         SimdReal sw_S1, dsw_S1;
 882 #ifndef HALF_LJ
 883         SimdReal sw_S2, dsw_S2;
 884         SimdReal sw_S3, dsw_S3;
 885 #endif
 886
 887 #define switch_r(rsw, rsw2, c3, c4, c5) fma(fma(fma(c5, rsw, c4), rsw, c3), ((rsw2) * (rsw)), one_S)
 888 #define dswitch_r(rsw, rsw2, c2, c3, c4) (fma(fma(c4, rsw, c3), rsw, c2) * (rsw2))
 889
 890         sw_S0  = switch_r(rsw_S0, rsw2_S0, swV3_S, swV4_S, swV5_S);
 891         dsw_S0 = dswitch_r(rsw_S0, rsw2_S0, swF2_S, swF3_S, swF4_S);
 892         sw_S1  = switch_r(rsw_S1, rsw2_S1, swV3_S, swV4_S, swV5_S);
 893         dsw_S1 = dswitch_r(rsw_S1, rsw2_S1, swF2_S, swF3_S, swF4_S);
 894 #ifndef HALF_LJ
 895         sw_S2  = switch_r(rsw_S2, rsw2_S2, swV3_S, swV4_S, swV5_S);
 896         dsw_S2 = dswitch_r(rsw_S2, rsw2_S2, swF2_S, swF3_S, swF4_S);
 897         sw_S3  = switch_r(rsw_S3, rsw2_S3, swV3_S, swV4_S, swV5_S);
 898         dsw_S3 = dswitch_r(rsw_S3, rsw2_S3, swF2_S, swF3_S, swF4_S);
 899 #endif
 900         frLJ_S0 = fnma(dsw_S0 * VLJ_S0, r_S0, sw_S0 * frLJ_S0);
 901         frLJ_S1 = fnma(dsw_S1 * VLJ_S1, r_S1, sw_S1 * frLJ_S1);
 902 #ifndef HALF_LJ
 903         frLJ_S2 = fnma(dsw_S2 * VLJ_S2, r_S2, sw_S2 * frLJ_S2);
 904         frLJ_S3 = fnma(dsw_S3 * VLJ_S3, r_S3, sw_S3 * frLJ_S3);
 905 #endif
 906 #ifdef CALC_ENERGIES
 907         VLJ_S0  = sw_S0 * VLJ_S0;
 908         VLJ_S1  = sw_S1 * VLJ_S1;
 909 #ifndef HALF_LJ
 910         VLJ_S2  = sw_S2 * VLJ_S2;
 911         VLJ_S3  = sw_S3 * VLJ_S3;
 912 #endif
 913 #endif
 914
 915 #undef switch_r
 916 #undef dswitch_r
 917     }
 918 #endif /* LJ_POT_SWITCH */
 919
 920 #if defined CALC_ENERGIES && defined CHECK_EXCLS
 921     /* The potential shift should be removed for excluded pairs */
 922     VLJ_S0      = selectByMask(VLJ_S0, interact_S0);
 923     VLJ_S1      = selectByMask(VLJ_S1, interact_S1);
 924 #ifndef HALF_LJ
 925     VLJ_S2      = selectByMask(VLJ_S2, interact_S2);
 926     VLJ_S3      = selectByMask(VLJ_S3, interact_S3);
 927 #endif
 928 #endif
 929
 930 #ifdef LJ_EWALD_GEOM
 931     {
 932         SimdReal c6s_j_S;
 933         SimdReal c6grid_S0, rinvsix_nm_S0, cr2_S0, expmcr2_S0, poly_S0;
 934         SimdReal c6grid_S1, rinvsix_nm_S1, cr2_S1, expmcr2_S1, poly_S1;
 935 #ifndef HALF_LJ
 936         SimdReal c6grid_S2, rinvsix_nm_S2, cr2_S2, expmcr2_S2, poly_S2;
 937         SimdReal c6grid_S3, rinvsix_nm_S3, cr2_S3, expmcr2_S3, poly_S3;
 938 #endif
 939 #ifdef CALC_ENERGIES
 940         SimdReal sh_mask_S0;
 941         SimdReal sh_mask_S1;
 942 #ifndef HALF_LJ
 943         SimdReal sh_mask_S2;
 944         SimdReal sh_mask_S3;
 945 #endif
 946 #endif
 947
 948         /* Determine C6 for the grid using the geometric combination rule */
 949         c6s_j_S         = load<SimdReal>(ljc+aj2+0);
 950         c6grid_S0       = c6s_S0 * c6s_j_S;
 951         c6grid_S1       = c6s_S1 * c6s_j_S;
 952 #ifndef HALF_LJ
 953         c6grid_S2       = c6s_S2 * c6s_j_S;
 954         c6grid_S3       = c6s_S3 * c6s_j_S;
 955 #endif
 956
 957 #ifdef CHECK_EXCLS
 958         /* Recalculate rinvsix without exclusion mask (compiler might optimize) */
 959         rinvsix_nm_S0 = rinvsq_S0 * rinvsq_S0 * rinvsq_S0;
 960         rinvsix_nm_S1 = rinvsq_S1 * rinvsq_S1 * rinvsq_S1;
 961 #ifndef HALF_LJ
 962         rinvsix_nm_S2 = rinvsq_S2 * rinvsq_S2 * rinvsq_S2;
 963         rinvsix_nm_S3 = rinvsq_S3 * rinvsq_S3 * rinvsq_S3;
 964 #endif
 965 #else
 966         /* We didn't use a mask, so we can copy */
 967         rinvsix_nm_S0 = rinvsix_S0;
 968         rinvsix_nm_S1 = rinvsix_S1;
 969 #ifndef HALF_LJ
 970         rinvsix_nm_S2 = rinvsix_S2;
 971         rinvsix_nm_S3 = rinvsix_S3;
 972 #endif
 973 #endif
 974
 975         /* Mask for the cut-off to avoid overflow of cr2^2 */
 976         cr2_S0        = lje_c2_S * selectByMask(rsq_S0, wco_vdw_S0);
 977         cr2_S1        = lje_c2_S * selectByMask(rsq_S1, wco_vdw_S1);
 978 #ifndef HALF_LJ
 979         cr2_S2        = lje_c2_S * selectByMask(rsq_S2, wco_vdw_S2);
 980         cr2_S3        = lje_c2_S * selectByMask(rsq_S3, wco_vdw_S3);
 981 #endif
 982         // Unsafe version of our exp() should be fine, since these arguments should never
 983         // be smaller than -127 for any reasonable choice of cutoff or ewald coefficients.
 984         expmcr2_S0    = exp<MathOptimization::Unsafe>( -cr2_S0 );
 985         expmcr2_S1    = exp<MathOptimization::Unsafe>( -cr2_S1 );
 986 #ifndef HALF_LJ
 987         expmcr2_S2    = exp<MathOptimization::Unsafe>( -cr2_S2 );
 988         expmcr2_S3    = exp<MathOptimization::Unsafe>( -cr2_S3 );
 989 #endif
 990
 991         /* 1 + cr2 + 1/2*cr2^2 */
 992         poly_S0       = fma(fma(half_S, cr2_S0, one_S), cr2_S0, one_S);
 993         poly_S1       = fma(fma(half_S, cr2_S1, one_S), cr2_S1, one_S);
 994 #ifndef HALF_LJ
 995         poly_S2       = fma(fma(half_S, cr2_S2, one_S), cr2_S2, one_S);
 996         poly_S3       = fma(fma(half_S, cr2_S3, one_S), cr2_S3, one_S);
 997 #endif
 998
 999         /* We calculate LJ F*r = (6*C6)*(r^-6 - F_mesh/6), we use:
1000          * r^-6*cexp*(1 + cr2 + cr2^2/2 + cr2^3/6) = cexp*(r^-6*poly + c^6/6)
1001          */
1002         frLJ_S0       = fma(c6grid_S0, fnma(expmcr2_S0, fma(rinvsix_nm_S0, poly_S0, lje_c6_6_S), rinvsix_nm_S0), frLJ_S0);
1003         frLJ_S1       = fma(c6grid_S1, fnma(expmcr2_S1, fma(rinvsix_nm_S1, poly_S1, lje_c6_6_S), rinvsix_nm_S1), frLJ_S1);
1004 #ifndef HALF_LJ
1005         frLJ_S2       = fma(c6grid_S2, fnma(expmcr2_S2, fma(rinvsix_nm_S2, poly_S2, lje_c6_6_S), rinvsix_nm_S2), frLJ_S2);
1006         frLJ_S3       = fma(c6grid_S3, fnma(expmcr2_S3, fma(rinvsix_nm_S3, poly_S3, lje_c6_6_S), rinvsix_nm_S3), frLJ_S3);
1007 #endif
1008
1009 #ifdef CALC_ENERGIES
1010 #ifdef CHECK_EXCLS
1011         sh_mask_S0    = selectByMask(lje_vc_S, interact_S0);
1012         sh_mask_S1    = selectByMask(lje_vc_S, interact_S1);
1013 #ifndef HALF_LJ
1014         sh_mask_S2    = selectByMask(lje_vc_S, interact_S2);
1015         sh_mask_S3    = selectByMask(lje_vc_S, interact_S3);
1016 #endif
1017 #else
1018         sh_mask_S0    = lje_vc_S;
1019         sh_mask_S1    = lje_vc_S;
1020 #ifndef HALF_LJ
1021         sh_mask_S2    = lje_vc_S;
1022         sh_mask_S3    = lje_vc_S;
1023 #endif
1024 #endif
1025
1026         VLJ_S0        = fma(sixth_S * c6grid_S0, fma(rinvsix_nm_S0, fnma(expmcr2_S0, poly_S0, one_S), sh_mask_S0), VLJ_S0);
1027         VLJ_S1        = fma(sixth_S * c6grid_S1, fma(rinvsix_nm_S1, fnma(expmcr2_S1, poly_S1, one_S), sh_mask_S1), VLJ_S1);
1028 #ifndef HALF_LJ
1029         VLJ_S2        = fma(sixth_S * c6grid_S2, fma(rinvsix_nm_S2, fnma(expmcr2_S2, poly_S2, one_S), sh_mask_S2), VLJ_S2);
1030         VLJ_S3        = fma(sixth_S * c6grid_S3, fma(rinvsix_nm_S3, fnma(expmcr2_S3, poly_S3, one_S), sh_mask_S3), VLJ_S3);
1031 #endif
1032 #endif /* CALC_ENERGIES */
1033     }
1034 #endif /* LJ_EWALD_GEOM */
1035
1036 #if defined VDW_CUTOFF_CHECK
1037     /* frLJ is multiplied later by rinvsq, which is masked for the Coulomb
1038      * cut-off, but if the VdW cut-off is shorter, we need to mask with that.
1039      */
1040     frLJ_S0     = selectByMask(frLJ_S0, wco_vdw_S0);
1041     frLJ_S1     = selectByMask(frLJ_S1, wco_vdw_S1);
1042 #ifndef HALF_LJ
1043     frLJ_S2     = selectByMask(frLJ_S2, wco_vdw_S2);
1044     frLJ_S3     = selectByMask(frLJ_S3, wco_vdw_S3);
1045 #endif
1046 #endif
1047
1048 #ifdef CALC_ENERGIES
1049     /* The potential shift should be removed for pairs beyond cut-off */
1050     VLJ_S0      = selectByMask(VLJ_S0, wco_vdw_S0);
1051     VLJ_S1      = selectByMask(VLJ_S1, wco_vdw_S1);
1052 #ifndef HALF_LJ
1053     VLJ_S2      = selectByMask(VLJ_S2, wco_vdw_S2);
1054     VLJ_S3      = selectByMask(VLJ_S3, wco_vdw_S3);
1055 #endif
1056 #endif
1057
1058 #endif /* CALC_LJ */
1059
1060 #ifdef CALC_ENERGIES
1061 #ifdef ENERGY_GROUPS
1062     /* Extract the group pair index per j pair.
1063      * Energy groups are stored per i-cluster, so things get
1064      * complicated when the i- and j-cluster size don't match.
1065      */
1066     {
1067         int egps_j;
1068 #if UNROLLJ == 2
1069         egps_j    = nbatParams.energrp[cj >> 1];
1070         egp_jj[0] = ((egps_j >> ((cj & 1)*egps_jshift)) & egps_jmask)*egps_jstride;
1071 #else
1072         /* We assume UNROLLI <= UNROLLJ */
1073         int jdi;
1074         for (jdi = 0; jdi < UNROLLJ/UNROLLI; jdi++)
1075         {
1076             int jj;
1077             egps_j = nbatParams.energrp[cj*(UNROLLJ/UNROLLI) + jdi];
1078             for (jj = 0; jj < (UNROLLI/2); jj++)
1079             {
1080                 egp_jj[jdi*(UNROLLI/2)+jj] = ((egps_j >> (jj*egps_jshift)) & egps_jmask)*egps_jstride;
1081             }
1082         }
1083 #endif
1084     }
1085 #endif
1086
1087 #ifdef CALC_COULOMB
1088 #ifndef ENERGY_GROUPS
1089     vctot_S      = vctot_S + vcoul_S0 + vcoul_S1 + vcoul_S2 + vcoul_S3;
1090 #else
1091     add_ener_grp(vcoul_S0, vctp[0], egp_jj);
1092     add_ener_grp(vcoul_S1, vctp[1], egp_jj);
1093     add_ener_grp(vcoul_S2, vctp[2], egp_jj);
1094     add_ener_grp(vcoul_S3, vctp[3], egp_jj);
1095 #endif
1096 #endif
1097
1098 #ifdef CALC_LJ
1099
1100 #ifndef ENERGY_GROUPS
1101 #ifndef HALF_LJ
1102     Vvdwtot_S   = Vvdwtot_S + VLJ_S0 + VLJ_S1 + VLJ_S2 + VLJ_S3;
1103 #else
1104     Vvdwtot_S   = Vvdwtot_S + VLJ_S0 + VLJ_S1;
1105 #endif
1106 #else
1107     add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
1108     add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
1109 #ifndef HALF_LJ
1110     add_ener_grp(VLJ_S2, vvdwtp[2], egp_jj);
1111     add_ener_grp(VLJ_S3, vvdwtp[3], egp_jj);
1112 #endif
1113 #endif
1114 #endif /* CALC_LJ */
1115 #endif /* CALC_ENERGIES */
1116
1117 #ifdef CALC_LJ
1118 #ifdef CALC_COULOMB
1119     fscal_S0    = rinvsq_S0 * (frcoul_S0 + frLJ_S0);
1120 #else
1121     fscal_S0    = rinvsq_S0 * frLJ_S0;
1122 #endif
1123 #ifdef CALC_COULOMB
1124     fscal_S1    = rinvsq_S1 * (frcoul_S1 + frLJ_S1);
1125 #else
1126     fscal_S1    = rinvsq_S1 * frLJ_S1;
1127 #endif
1128 #else
1129     fscal_S0    = rinvsq_S0 * frcoul_S0;
1130     fscal_S1    = rinvsq_S1 * frcoul_S1;
1131 #endif /* CALC_LJ */
1132 #if defined CALC_LJ && !defined HALF_LJ
1133 #ifdef CALC_COULOMB
1134     fscal_S2    = rinvsq_S2 * (frcoul_S2 + frLJ_S2);
1135     fscal_S3    = rinvsq_S3 * (frcoul_S3 + frLJ_S3);
1136 #else
1137     fscal_S2    = rinvsq_S2 * frLJ_S2;
1138     fscal_S3    = rinvsq_S3 * frLJ_S3;
1139 #endif
1140 #else
1141     /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
1142     fscal_S2    = rinvsq_S2 * frcoul_S2;
1143     fscal_S3    = rinvsq_S3 * frcoul_S3;
1144 #endif
1145
1146     /* Calculate temporary vectorial force */
1147     tx_S0       = fscal_S0 * dx_S0;
1148     tx_S1       = fscal_S1 * dx_S1;
1149     tx_S2       = fscal_S2 * dx_S2;
1150     tx_S3       = fscal_S3 * dx_S3;
1151     ty_S0       = fscal_S0 * dy_S0;
1152     ty_S1       = fscal_S1 * dy_S1;
1153     ty_S2       = fscal_S2 * dy_S2;
1154     ty_S3       = fscal_S3 * dy_S3;
1155     tz_S0       = fscal_S0 * dz_S0;
1156     tz_S1       = fscal_S1 * dz_S1;
1157     tz_S2       = fscal_S2 * dz_S2;
1158     tz_S3       = fscal_S3 * dz_S3;
1159
1160     /* Increment i atom force */
1161     fix_S0      = fix_S0 + tx_S0;
1162     fix_S1      = fix_S1 + tx_S1;
1163     fix_S2      = fix_S2 + tx_S2;
1164     fix_S3      = fix_S3 + tx_S3;
1165     fiy_S0      = fiy_S0 + ty_S0;
1166     fiy_S1      = fiy_S1 + ty_S1;
1167     fiy_S2      = fiy_S2 + ty_S2;
1168     fiy_S3      = fiy_S3 + ty_S3;
1169     fiz_S0      = fiz_S0 + tz_S0;
1170     fiz_S1      = fiz_S1 + tz_S1;
1171     fiz_S2      = fiz_S2 + tz_S2;
1172     fiz_S3      = fiz_S3 + tz_S3;
1173
1174     /* Decrement j atom force */
1175     store(f+ajx, load<SimdReal>(f+ajx) - (tx_S0 + tx_S1 + tx_S2 + tx_S3));
1176     store(f+ajy, load<SimdReal>(f+ajy) - (ty_S0 + ty_S1 + ty_S2 + ty_S3));
1177     store(f+ajz, load<SimdReal>(f+ajz) - (tz_S0 + tz_S1 + tz_S2 + tz_S3));
1178 }
1179
1180 #undef  rinv_ex_S0
1181 #undef  rinv_ex_S1
1182 #undef  rinv_ex_S2
1183 #undef  rinv_ex_S3
1184
1185 #undef  wco_vdw_S0
1186 #undef  wco_vdw_S1
1187 #undef  wco_vdw_S2
1188 #undef  wco_vdw_S3
1189
1190 #undef  EXCL_FORCES
1191
1192 #endif // !DOXYGEN