src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_avx_128_fma_single.c

   1 /*
   2  * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  * Copyright (c) 2001-2012, The GROMACS Development Team
   9  *
  10  * Gromacs is a library for molecular simulation and trajectory analysis,
  11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  12  * a full list of developers and information, check out http://www.gromacs.org
  13  *
  14  * This program is free software; you can redistribute it and/or modify it under
  15  * the terms of the GNU Lesser General Public License as published by the Free
  16  * Software Foundation; either version 2 of the License, or (at your option) any
  17  * later version.
  18  *
  19  * To help fund GROMACS development, we humbly ask that you cite
  20  * the papers people have written on it - you can find them on the website.
  21  */
  22 #ifdef HAVE_CONFIG_H
  23 #include <config.h>
  24 #endif
  25
  26 #include <math.h>
  27
  28 #include "../nb_kernel.h"
  29 #include "types/simple.h"
  30 #include "vec.h"
  31 #include "nrnb.h"
  32
  33 #include "gmx_math_x86_avx_128_fma_single.h"
  34 #include "kernelutil_x86_avx_128_fma_single.h"
  35
  36 /*
  37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
  38  * Electrostatics interaction: Ewald
  39  * VdW interaction:            LennardJones
  40  * Geometry:                   Water3-Particle
  41  * Calculate force/pot:        PotentialAndForce
      *
      * Summary of what the code below does:
      *  - For every i entry of the neighbor list, three i atoms (suffixes 0,1,2)
      *    interact with single j atoms, processed four j atoms per SIMD pass.
      *  - All three i atoms get the Ewald electrostatics term; only i atom 0 also
      *    gets the Lennard-Jones term.
      *  - Both potentials are shifted (fr->ic->sh_ewald, fr->ic->sh_invrc6) and
      *    every energy/force contribution is masked lane-wise with
      *    rsq < fr->rcoulomb^2.
      *  - Energies are handed to gmx_mm_update_1pot_ps() per energy group, forces
      *    to the decrement/update helpers together with pointers into ff and
      *    fr->fshift, and a flop estimate is passed to inc_nrnb().
      *
      * Parameters (fields listed are the ones this function reads):
      *  nlist        neighbor list: nri, iinr, jindex, jjnr, shift, gid
      *  xx           coordinates, accessed as a flat real array through xx[0]
      *  ff           forces, accessed as a flat real array through ff[0]
      *  fr           shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and
      *               ic->{sh_ewald, ewaldcoeff, tabq_coul_FDV0, tabq_scale, sh_invrc6}
      *  mdatoms      chargeA, typeA
      *  kernel_data  energygrp_elec, energygrp_vdw
      *  nrnb         flop counter record, only passed on to inc_nrnb()
  42  */
  43 void
  44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_128_fma_single
  45                     (t_nblist * gmx_restrict                nlist,
  46                      rvec * gmx_restrict                    xx,
  47                      rvec * gmx_restrict                    ff,
  48                      t_forcerec * gmx_restrict              fr,
  49                      t_mdatoms * gmx_restrict               mdatoms,
  50                      nb_kernel_data_t * gmx_restrict        kernel_data,
  51                      t_nrnb * gmx_restrict                  nrnb)
  52 {
  53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
  54      * just 0 for non-waters.
  55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
  56      * jnr indices corresponding to data put in the four positions in the SIMD register.
  57      */
         /* Note: several of the locals declared below (e.g. jidxall, isai0..2, isaj0,
          * vdwioffset1/2, c6_10/c12_10, c6_20/c12_20, crf, krf, krf2, ewitab, eweps,
          * twoeweps, ewrt, ewtabF/Fn/D/V, signbit, one, two) are never read in this
          * function; they appear to be generic output of the kernel generator.
          */
  58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
  59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
  60     int              jnrA,jnrB,jnrC,jnrD;
  61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
  62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
  63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
  64     real             rcutoff_scalar;
  65     real             *shiftvec,*fshift,*x,*f;
  66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
  67     real             scratch[4*DIM];
  68     __m128           fscal,rcutoff,rcutoff2,jidxall;
  69     int              vdwioffset0;
  70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
  71     int              vdwioffset1;
  72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
  73     int              vdwioffset2;
  74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
  75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
  76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
  77     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
  78     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
  79     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
  80     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
  81     real             *charge;
  82     int              nvdwtype;
  83     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
  84     int              *vdwtype;
  85     real             *vdwparam;
  86     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
  87     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
  88     __m128i          ewitab;
  89     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
  90     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
  91     real             *ewtab;
  92     __m128           dummy_mask,cutoff_mask;
  93     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
  94     __m128           one     = _mm_set1_ps(1.0);
  95     __m128           two     = _mm_set1_ps(2.0);
         /* View coordinates and forces as flat real arrays; atom n is addressed
          * below as x[DIM*n] / f[DIM*n].
          */
  96     x                = xx[0];
  97     f                = ff[0];
  98
         /* Cache neighbor-list arrays and force-field parameters in locals. */
  99     nri              = nlist->nri;
 100     iinr             = nlist->iinr;
 101     jindex           = nlist->jindex;
 102     jjnr             = nlist->jjnr;
 103     shiftidx         = nlist->shift;
 104     gid              = nlist->gid;
 105     shiftvec         = fr->shift_vec[0];
 106     fshift           = fr->fshift[0];
 107     facel            = _mm_set1_ps(fr->epsfac);
 108     charge           = mdatoms->chargeA;
 109     nvdwtype         = fr->ntype;
 110     vdwparam         = fr->nbfp;
 111     vdwtype          = mdatoms->typeA;
 112
         /* Ewald parameters. beta, beta2, beta3 and sh_ewald are used by the
          * analytical PME correction below; ewtab, ewtabscale and ewtabhalfspace
          * are assigned here but not read again in this function.
          */
 113     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
 114     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
 115     beta2            = _mm_mul_ps(beta,beta);
 116     beta3            = _mm_mul_ps(beta,beta2);
 117     ewtab            = fr->ic->tabq_coul_FDV0;
 118     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
 119     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
 120
 121     /* Setup water-specific parameters */
         /* The i-atom charges (pre-multiplied by epsfac) and the VdW row offset of
          * i atom 0 are taken once from the first list entry and reused for every
          * i entry in the outer loop.
          * NOTE(review): this presumes all i molecules in the list share charges
          * and VdW type; iinr[0] is also read unconditionally, i.e. even when
          * nri == 0 - confirm that callers never pass an empty list.
          */
 122     inr              = nlist->iinr[0];
 123     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
 124     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
 125     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
 126     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 127
 128     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 129     rcutoff_scalar   = fr->rcoulomb;
 130     rcutoff          = _mm_set1_ps(rcutoff_scalar);
 131     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
 132
         /* sh_vdw_invrcut6 enters the shifted LJ potential below; rvdw is set but
          * not read again in this function.
          */
 133     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
 134     rvdw             = _mm_set1_ps(fr->rvdw);
 135
 136     /* Avoid stupid compiler warnings */
 137     jnrA = jnrB = jnrC = jnrD = 0;
 138     j_coord_offsetA = 0;
 139     j_coord_offsetB = 0;
 140     j_coord_offsetC = 0;
 141     j_coord_offsetD = 0;
 142
 143     outeriter        = 0;
 144     inneriter        = 0;
 145
         /* scratch is the force destination for dummy j entries in the epilogue
          * further down, so that their (masked) contributions never touch f.
          */
 146     for(iidx=0;iidx<4*DIM;iidx++)
 147     {
 148         scratch[iidx] = 0.0;
 149     }
 150
 151     /* Start outer loop over neighborlists */
 152     for(iidx=0; iidx<nri; iidx++)
 153     {
 154         /* Load shift vector for this list */
 155         i_shift_offset   = DIM*shiftidx[iidx];
 156
 157         /* Load limits for loop over neighbors */
 158         j_index_start    = jindex[iidx];
 159         j_index_end      = jindex[iidx+1];
 160
 161         /* Get outer coordinate index */
 162         inr              = iinr[iidx];
 163         i_coord_offset   = DIM*inr;
 164
 165         /* Load i particle coords and add shift vector */
 166         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 167                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 168
             /* Per-i-entry force accumulators for the three i atoms. */
 169         fix0             = _mm_setzero_ps();
 170         fiy0             = _mm_setzero_ps();
 171         fiz0             = _mm_setzero_ps();
 172         fix1             = _mm_setzero_ps();
 173         fiy1             = _mm_setzero_ps();
 174         fiz1             = _mm_setzero_ps();
 175         fix2             = _mm_setzero_ps();
 176         fiy2             = _mm_setzero_ps();
 177         fiz2             = _mm_setzero_ps();
 178
 179         /* Reset potential sums */
 180         velecsum         = _mm_setzero_ps();
 181         vvdwsum          = _mm_setzero_ps();
 182
 183         /* Start inner kernel loop */
             /* Main loop: four j atoms per pass, taken only while the fourth entry
              * of the group is non-negative; a group containing negative (dummy)
              * entries is left to the epilogue after this loop.
              * NOTE(review): jjnr[jidx+3] is read whenever jidx<j_index_end, so this
              * relies on jjnr being padded to a multiple of four - confirm with the
              * neighbor-list builder.
              */
 184         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 185         {
 186
 187             /* Get j neighbor index, and coordinate index */
 188             jnrA             = jjnr[jidx];
 189             jnrB             = jjnr[jidx+1];
 190             jnrC             = jjnr[jidx+2];
 191             jnrD             = jjnr[jidx+3];
 192             j_coord_offsetA  = DIM*jnrA;
 193             j_coord_offsetB  = DIM*jnrB;
 194             j_coord_offsetC  = DIM*jnrC;
 195             j_coord_offsetD  = DIM*jnrD;
 196
 197             /* load j atom coordinates */
 198             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 199                                               x+j_coord_offsetC,x+j_coord_offsetD,
 200                                               &jx0,&jy0,&jz0);
 201
 202             /* Calculate displacement vector */
 203             dx00             = _mm_sub_ps(ix0,jx0);
 204             dy00             = _mm_sub_ps(iy0,jy0);
 205             dz00             = _mm_sub_ps(iz0,jz0);
 206             dx10             = _mm_sub_ps(ix1,jx0);
 207             dy10             = _mm_sub_ps(iy1,jy0);
 208             dz10             = _mm_sub_ps(iz1,jz0);
 209             dx20             = _mm_sub_ps(ix2,jx0);
 210             dy20             = _mm_sub_ps(iy2,jy0);
 211             dz20             = _mm_sub_ps(iz2,jz0);
 212
 213             /* Calculate squared distance and things based on it */
 214             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 215             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
 216             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
 217
 218             rinv00           = gmx_mm_invsqrt_ps(rsq00);
 219             rinv10           = gmx_mm_invsqrt_ps(rsq10);
 220             rinv20           = gmx_mm_invsqrt_ps(rsq20);
 221
 222             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 223             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
 224             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
 225
 226             /* Load parameters for j particles */
 227             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 228                                                               charge+jnrC+0,charge+jnrD+0);
 229             vdwjidx0A        = 2*vdwtype[jnrA+0];
 230             vdwjidx0B        = 2*vdwtype[jnrB+0];
 231             vdwjidx0C        = 2*vdwtype[jnrC+0];
 232             vdwjidx0D        = 2*vdwtype[jnrD+0];
 233
                 /* j-force accumulator, shared by the three i-j pair blocks below. */
 234             fjx0             = _mm_setzero_ps();
 235             fjy0             = _mm_setzero_ps();
 236             fjz0             = _mm_setzero_ps();
 237
 238             /**************************
 239              * CALCULATE INTERACTIONS *
 240              **************************/
 241
                 /* Pair block i0-j0: Ewald electrostatics plus Lennard-Jones.
                  * The whole block is skipped when gmx_mm_any_lt() is false
                  * (presumably: no lane has rsq00 < rcutoff2); inside the block
                  * individual lanes are still masked with cutoff_mask.
                  */
 242             if (gmx_mm_any_lt(rsq00,rcutoff2))
 243             {
 244
                 /* r00 is computed but not read again in this function. */
 245             r00              = _mm_mul_ps(rsq00,rinv00);
 246
 247             /* Compute parameters for interactions between i and j atoms */
 248             qq00             = _mm_mul_ps(iq0,jq0);
 249             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 250                                          vdwparam+vdwioffset0+vdwjidx0B,
 251                                          vdwparam+vdwioffset0+vdwjidx0C,
 252                                          vdwparam+vdwioffset0+vdwjidx0D,
 253                                          &c6_00,&c12_00);
 254
 255             /* EWALD ELECTROSTATICS */
 256
 257             /* Analytical PME correction */
                 /* Assuming FMA4 semantics (_mm_macc_ps(a,b,c) = a*b+c,
                  * _mm_nmacc_ps(a,b,c) = c-a*b) this evaluates
                  *   felec = qq*(pmecorrF*beta^3 + rinv^3)
                  *   velec = qq*((rinv - sh_ewald) - pmecorrV*beta)
                  */
 258             zeta2            = _mm_mul_ps(beta2,rsq00);
 259             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
 260             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 261             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 262             felec            = _mm_mul_ps(qq00,felec);
 263             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
 264             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
 265             velec            = _mm_mul_ps(qq00,velec);
 266
 267             /* LENNARD-JONES DISPERSION/REPULSION */
 268
                 /* Assuming FMA4 semantics (_mm_msub_ps(a,b,c) = a*b-c) this evaluates
                  *   vvdw = (vvdw12 - c12*sh_invrc6^2)/12 - (vvdw6 - c6*sh_invrc6)/6
                  *   fvdw = (vvdw12 - vvdw6)*rinvsq
                  */
 269             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 270             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 271             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 272             vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
 273                                           _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
 274             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
 275
                 /* All-ones in lanes inside the cutoff, zero elsewhere. */
 276             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
 277
 278             /* Update potential sum for this i atom from the interaction with this j atom. */
 279             velec            = _mm_and_ps(velec,cutoff_mask);
 280             velecsum         = _mm_add_ps(velecsum,velec);
 281             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
 282             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 283
 284             fscal            = _mm_add_ps(felec,fvdw);
 285
 286             fscal            = _mm_and_ps(fscal,cutoff_mask);
 287
 288              /* Update vectorial force */
 289             fix0             = _mm_macc_ps(dx00,fscal,fix0);
 290             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 291             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 292
 293             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
 294             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
 295             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 296
 297             }
 298
 299             /**************************
 300              * CALCULATE INTERACTIONS *
 301              **************************/
 302
                 /* Pair block i1-j0: Ewald electrostatics only (no LJ term). */
 303             if (gmx_mm_any_lt(rsq10,rcutoff2))
 304             {
 305
                 /* r10 is computed but not read again in this function. */
 306             r10              = _mm_mul_ps(rsq10,rinv10);
 307
 308             /* Compute parameters for interactions between i and j atoms */
 309             qq10             = _mm_mul_ps(iq1,jq0);
 310
 311             /* EWALD ELECTROSTATICS */
 312
 313             /* Analytical PME correction */
 314             zeta2            = _mm_mul_ps(beta2,rsq10);
 315             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
 316             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 317             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 318             felec            = _mm_mul_ps(qq10,felec);
 319             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
 320             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
 321             velec            = _mm_mul_ps(qq10,velec);
 322
 323             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
 324
 325             /* Update potential sum for this i atom from the interaction with this j atom. */
 326             velec            = _mm_and_ps(velec,cutoff_mask);
 327             velecsum         = _mm_add_ps(velecsum,velec);
 328
 329             fscal            = felec;
 330
 331             fscal            = _mm_and_ps(fscal,cutoff_mask);
 332
 333              /* Update vectorial force */
 334             fix1             = _mm_macc_ps(dx10,fscal,fix1);
 335             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
 336             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 337
 338             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
 339             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
 340             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 341
 342             }
 343
 344             /**************************
 345              * CALCULATE INTERACTIONS *
 346              **************************/
 347
                 /* Pair block i2-j0: Ewald electrostatics only (no LJ term). */
 348             if (gmx_mm_any_lt(rsq20,rcutoff2))
 349             {
 350
                 /* r20 is computed but not read again in this function. */
 351             r20              = _mm_mul_ps(rsq20,rinv20);
 352
 353             /* Compute parameters for interactions between i and j atoms */
 354             qq20             = _mm_mul_ps(iq2,jq0);
 355
 356             /* EWALD ELECTROSTATICS */
 357
 358             /* Analytical PME correction */
 359             zeta2            = _mm_mul_ps(beta2,rsq20);
 360             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
 361             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 362             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 363             felec            = _mm_mul_ps(qq20,felec);
 364             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
 365             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
 366             velec            = _mm_mul_ps(qq20,velec);
 367
 368             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
 369
 370             /* Update potential sum for this i atom from the interaction with this j atom. */
 371             velec            = _mm_and_ps(velec,cutoff_mask);
 372             velecsum         = _mm_add_ps(velecsum,velec);
 373
 374             fscal            = felec;
 375
 376             fscal            = _mm_and_ps(fscal,cutoff_mask);
 377
 378              /* Update vectorial force */
 379             fix2             = _mm_macc_ps(dx20,fscal,fix2);
 380             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
 381             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 382
 383             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
 384             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
 385             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 386
 387             }
 388
                 /* Hand the accumulated j forces to the decrement helper together
                  * with the four j force addresses.
                  */
 389             fjptrA             = f+j_coord_offsetA;
 390             fjptrB             = f+j_coord_offsetB;
 391             fjptrC             = f+j_coord_offsetC;
 392             fjptrD             = f+j_coord_offsetD;
 393
 394             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 395
 396             /* Inner loop uses 117 flops */
 397         }
 398
             /* Epilogue: at most one remaining group of four list entries, some of
              * which may be negative (dummy). Same arithmetic as the main loop,
              * with dummy lanes additionally cleared through dummy_mask.
              */
 399         if(jidx<j_index_end)
 400         {
 401
 402             /* Get j neighbor index, and coordinate index */
 403             jnrlistA         = jjnr[jidx];
 404             jnrlistB         = jjnr[jidx+1];
 405             jnrlistC         = jjnr[jidx+2];
 406             jnrlistD         = jjnr[jidx+3];
 407             /* Sign of each element will be negative for non-real atoms.
 408              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 409              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 410              */
 411             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
                 /* Dummy entries are redirected to atom 0 so that the coordinate and
                  * parameter loads below stay inside valid memory.
                  */
 412             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 413             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 414             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 415             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 416             j_coord_offsetA  = DIM*jnrA;
 417             j_coord_offsetB  = DIM*jnrB;
 418             j_coord_offsetC  = DIM*jnrC;
 419             j_coord_offsetD  = DIM*jnrD;
 420
 421             /* load j atom coordinates */
 422             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 423                                               x+j_coord_offsetC,x+j_coord_offsetD,
 424                                               &jx0,&jy0,&jz0);
 425
 426             /* Calculate displacement vector */
 427             dx00             = _mm_sub_ps(ix0,jx0);
 428             dy00             = _mm_sub_ps(iy0,jy0);
 429             dz00             = _mm_sub_ps(iz0,jz0);
 430             dx10             = _mm_sub_ps(ix1,jx0);
 431             dy10             = _mm_sub_ps(iy1,jy0);
 432             dz10             = _mm_sub_ps(iz1,jz0);
 433             dx20             = _mm_sub_ps(ix2,jx0);
 434             dy20             = _mm_sub_ps(iy2,jy0);
 435             dz20             = _mm_sub_ps(iz2,jz0);
 436
 437             /* Calculate squared distance and things based on it */
 438             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 439             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
 440             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
 441
 442             rinv00           = gmx_mm_invsqrt_ps(rsq00);
 443             rinv10           = gmx_mm_invsqrt_ps(rsq10);
 444             rinv20           = gmx_mm_invsqrt_ps(rsq20);
 445
 446             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 447             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
 448             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
 449
 450             /* Load parameters for j particles */
 451             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 452                                                               charge+jnrC+0,charge+jnrD+0);
 453             vdwjidx0A        = 2*vdwtype[jnrA+0];
 454             vdwjidx0B        = 2*vdwtype[jnrB+0];
 455             vdwjidx0C        = 2*vdwtype[jnrC+0];
 456             vdwjidx0D        = 2*vdwtype[jnrD+0];
 457
 458             fjx0             = _mm_setzero_ps();
 459             fjy0             = _mm_setzero_ps();
 460             fjz0             = _mm_setzero_ps();
 461
 462             /**************************
 463              * CALCULATE INTERACTIONS *
 464              **************************/
 465
                 /* Pair block i0-j0 (epilogue): Ewald electrostatics plus LJ. */
 466             if (gmx_mm_any_lt(rsq00,rcutoff2))
 467             {
 468
                 /* r00 is computed and masked but not read again in this function. */
 469             r00              = _mm_mul_ps(rsq00,rinv00);
 470             r00              = _mm_andnot_ps(dummy_mask,r00);
 471
 472             /* Compute parameters for interactions between i and j atoms */
 473             qq00             = _mm_mul_ps(iq0,jq0);
 474             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 475                                          vdwparam+vdwioffset0+vdwjidx0B,
 476                                          vdwparam+vdwioffset0+vdwjidx0C,
 477                                          vdwparam+vdwioffset0+vdwjidx0D,
 478                                          &c6_00,&c12_00);
 479
 480             /* EWALD ELECTROSTATICS */
 481
 482             /* Analytical PME correction */
 483             zeta2            = _mm_mul_ps(beta2,rsq00);
 484             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
 485             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 486             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 487             felec            = _mm_mul_ps(qq00,felec);
 488             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
 489             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv00,sh_ewald));
 490             velec            = _mm_mul_ps(qq00,velec);
 491
 492             /* LENNARD-JONES DISPERSION/REPULSION */
 493
 494             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 495             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 496             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 497             vvdw             = _mm_msub_ps(_mm_nmacc_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
 498                                           _mm_mul_ps( _mm_nmacc_ps(c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
 499             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
 500
 501             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
 502
 503             /* Update potential sum for this i atom from the interaction with this j atom. */
                 /* Energies and the scalar force are cleared first outside the
                  * cutoff and then for dummy lanes, before being accumulated.
                  */
 504             velec            = _mm_and_ps(velec,cutoff_mask);
 505             velec            = _mm_andnot_ps(dummy_mask,velec);
 506             velecsum         = _mm_add_ps(velecsum,velec);
 507             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
 508             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 509             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 510
 511             fscal            = _mm_add_ps(felec,fvdw);
 512
 513             fscal            = _mm_and_ps(fscal,cutoff_mask);
 514
 515             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 516
 517              /* Update vectorial force */
 518             fix0             = _mm_macc_ps(dx00,fscal,fix0);
 519             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 520             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 521
 522             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
 523             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
 524             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 525
 526             }
 527
 528             /**************************
 529              * CALCULATE INTERACTIONS *
 530              **************************/
 531
                 /* Pair block i1-j0 (epilogue): Ewald electrostatics only. */
 532             if (gmx_mm_any_lt(rsq10,rcutoff2))
 533             {
 534
                 /* r10 is computed and masked but not read again in this function. */
 535             r10              = _mm_mul_ps(rsq10,rinv10);
 536             r10              = _mm_andnot_ps(dummy_mask,r10);
 537
 538             /* Compute parameters for interactions between i and j atoms */
 539             qq10             = _mm_mul_ps(iq1,jq0);
 540
 541             /* EWALD ELECTROSTATICS */
 542
 543             /* Analytical PME correction */
 544             zeta2            = _mm_mul_ps(beta2,rsq10);
 545             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
 546             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 547             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 548             felec            = _mm_mul_ps(qq10,felec);
 549             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
 550             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv10,sh_ewald));
 551             velec            = _mm_mul_ps(qq10,velec);
 552
 553             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
 554
 555             /* Update potential sum for this i atom from the interaction with this j atom. */
 556             velec            = _mm_and_ps(velec,cutoff_mask);
 557             velec            = _mm_andnot_ps(dummy_mask,velec);
 558             velecsum         = _mm_add_ps(velecsum,velec);
 559
 560             fscal            = felec;
 561
 562             fscal            = _mm_and_ps(fscal,cutoff_mask);
 563
 564             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 565
 566              /* Update vectorial force */
 567             fix1             = _mm_macc_ps(dx10,fscal,fix1);
 568             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
 569             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 570
 571             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
 572             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
 573             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 574
 575             }
 576
 577             /**************************
 578              * CALCULATE INTERACTIONS *
 579              **************************/
 580
                 /* Pair block i2-j0 (epilogue): Ewald electrostatics only. */
 581             if (gmx_mm_any_lt(rsq20,rcutoff2))
 582             {
 583
                 /* r20 is computed and masked but not read again in this function. */
 584             r20              = _mm_mul_ps(rsq20,rinv20);
 585             r20              = _mm_andnot_ps(dummy_mask,r20);
 586
 587             /* Compute parameters for interactions between i and j atoms */
 588             qq20             = _mm_mul_ps(iq2,jq0);
 589
 590             /* EWALD ELECTROSTATICS */
 591
 592             /* Analytical PME correction */
 593             zeta2            = _mm_mul_ps(beta2,rsq20);
 594             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
 595             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 596             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 597             felec            = _mm_mul_ps(qq20,felec);
 598             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
 599             velec            = _mm_nmacc_ps(pmecorrV,beta,_mm_sub_ps(rinv20,sh_ewald));
 600             velec            = _mm_mul_ps(qq20,velec);
 601
 602             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
 603
 604             /* Update potential sum for this i atom from the interaction with this j atom. */
 605             velec            = _mm_and_ps(velec,cutoff_mask);
 606             velec            = _mm_andnot_ps(dummy_mask,velec);
 607             velecsum         = _mm_add_ps(velecsum,velec);
 608
 609             fscal            = felec;
 610
 611             fscal            = _mm_and_ps(fscal,cutoff_mask);
 612
 613             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 614
 615              /* Update vectorial force */
 616             fix2             = _mm_macc_ps(dx20,fscal,fix2);
 617             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
 618             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 619
 620             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
 621             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
 622             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 623
 624             }
 625
                 /* Real entries point into f; dummy entries point at the local
                  * scratch buffer instead of at atom 0's force.
                  */
 626             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 627             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 628             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 629             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 630
 631             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 632
 633             /* Inner loop uses 120 flops */
 634         }
 635
 636         /* End of innermost loop */
 637
             /* Pass the three i-atom force accumulators to the update helper along
              * with the i force address and the shift-force address for this list.
              */
 638         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 639                                               f+i_coord_offset,fshift+i_shift_offset);
 640
 641         ggid                        = gid[iidx];
 642         /* Update potential energies */
 643         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 644         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 645
 646         /* Increment number of inner iterations */
             /* Counts every list slot between the two jindex bounds, so dummy
              * entries handled in the epilogue are included.
              */
 647         inneriter                  += j_index_end - j_index_start;
 648
 649         /* Outer loop uses 20 flops */
 650     }
 651
 652     /* Increment number of outer iterations */
 653     outeriter        += nri;
 654
 655     /* Update outer/inner flops */
 656
         /* The accounting charges 120 flops per j slot for both the main loop and
          * the epilogue, plus 20 per i entry.
          */
 657     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*120);
 658 }
 659 /*
 660  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
 661  * Electrostatics interaction: Ewald
 662  * VdW interaction:            LennardJones
 663  * Geometry:                   Water3-Particle
 664  * Calculate force/pot:        Force
      *
      * Force-only kernel. For every i entry of the neighbor list it works on three i atoms
      * (suffixes 0,1,2; coordinates and forces addressed from DIM*iinr[iidx]) against single
      * j atoms, four j entries at a time in one __m128. No potential energy is accumulated.
      *
      * Parameters, as used in this function:
      *   nlist       - source of nri, iinr, jindex, jjnr, shift and gid
      *   xx          - coordinates, read through x = xx[0]
      *   ff          - forces, updated through f = ff[0]
      *   fr          - source of shift_vec, fshift, epsfac, ntype, nbfp, rcoulomb, rvdw and the
      *                 fr->ic settings (sh_ewald, ewaldcoeff, tabq_coul_F, tabq_scale, sh_invrc6)
      *   mdatoms     - source of chargeA and typeA
      *   kernel_data - not referenced in this force-only kernel
      *   nrnb        - passed to inc_nrnb() together with the flop estimate at the end
      *
      * NOTE(review): several locals are assigned but never read afterwards in this function,
      * for example gid, sh_ewald, ewtab, ewtabscale, ewtabhalfspace, sh_vdw_invrcut6, rvdw and
      * r00/r10/r20; others are only declared. Presumably leftovers of the kernel generator.
 665  */
 666 void
 667 nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_128_fma_single
 668                     (t_nblist * gmx_restrict                nlist,
 669                      rvec * gmx_restrict                    xx,
 670                      rvec * gmx_restrict                    ff,
 671                      t_forcerec * gmx_restrict              fr,
 672                      t_mdatoms * gmx_restrict               mdatoms,
 673                      nb_kernel_data_t * gmx_restrict        kernel_data,
 674                      t_nrnb * gmx_restrict                  nrnb)
 675 {
 676     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 677      * just 0 for non-waters.
 678      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 679      * jnr indices corresponding to data put in the four positions in the SIMD register.
 680      */
 681     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 682     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 683     int              jnrA,jnrB,jnrC,jnrD;
 684     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 685     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 686     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 687     real             rcutoff_scalar;
 688     real             *shiftvec,*fshift,*x,*f;
 689     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 690     real             scratch[4*DIM];
 691     __m128           fscal,rcutoff,rcutoff2,jidxall;
 692     int              vdwioffset0;
 693     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 694     int              vdwioffset1;
 695     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
 696     int              vdwioffset2;
 697     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
 698     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 699     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 700     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 701     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
 702     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
 703     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 704     real             *charge;
 705     int              nvdwtype;
 706     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 707     int              *vdwtype;
 708     real             *vdwparam;
 709     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 710     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 711     __m128i          ewitab;
 712     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
 713     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
 714     real             *ewtab;
 715     __m128           dummy_mask,cutoff_mask;
 716     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 717     __m128           one     = _mm_set1_ps(1.0);
 718     __m128           two     = _mm_set1_ps(2.0);
 719     x                = xx[0];
 720     f                = ff[0];
 721
 722     nri              = nlist->nri;
 723     iinr             = nlist->iinr;
 724     jindex           = nlist->jindex;
 725     jjnr             = nlist->jjnr;
 726     shiftidx         = nlist->shift;
 727     gid              = nlist->gid;
 728     shiftvec         = fr->shift_vec[0];
 729     fshift           = fr->fshift[0];
 730     facel            = _mm_set1_ps(fr->epsfac);
 731     charge           = mdatoms->chargeA;
 732     nvdwtype         = fr->ntype;
 733     vdwparam         = fr->nbfp;
 734     vdwtype          = mdatoms->typeA;
 735
         /* Ewald constants. Only beta2 and beta3 are read by the interaction code below. */
 736     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
 737     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
 738     beta2            = _mm_mul_ps(beta,beta);
 739     beta3            = _mm_mul_ps(beta,beta2);
 740     ewtab            = fr->ic->tabq_coul_F;
 741     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
 742     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
 743
 744     /* Setup water-specific parameters */
         /* The i-atom charges (pre-multiplied by facel) and the LJ parameter offset are read once,
          * from the first list entry, and reused for every i entry of the outer loop.
          * Only atom 0 gets a VdW offset; atoms 1 and 2 take part in electrostatics only below.
          * NOTE(review): presumably every i entry in this list has identical parameters, and
          * iinr[0] is read here even when nri is 0 - confirm both against the caller.
          */
 745     inr              = nlist->iinr[0];
 746     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
 747     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
 748     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
 749     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 750
 751     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
 752     rcutoff_scalar   = fr->rcoulomb;
 753     rcutoff          = _mm_set1_ps(rcutoff_scalar);
 754     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
 755
 756     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
 757     rvdw             = _mm_set1_ps(fr->rvdw);
 758
 759     /* Avoid stupid compiler warnings */
 760     jnrA = jnrB = jnrC = jnrD = 0;
 761     j_coord_offsetA = 0;
 762     j_coord_offsetB = 0;
 763     j_coord_offsetC = 0;
 764     j_coord_offsetD = 0;
 765
 766     outeriter        = 0;
 767     inneriter        = 0;
 768
         /* Zero the scratch buffer; the masked epilogue below points the j-force update of
          * dummy entries at it instead of at f.
          */
 769     for(iidx=0;iidx<4*DIM;iidx++)
 770     {
 771         scratch[iidx] = 0.0;
 772     }
 773
 774     /* Start outer loop over neighborlists */
 775     for(iidx=0; iidx<nri; iidx++)
 776     {
 777         /* Load shift vector for this list */
 778         i_shift_offset   = DIM*shiftidx[iidx];
 779
 780         /* Load limits for loop over neighbors */
 781         j_index_start    = jindex[iidx];
 782         j_index_end      = jindex[iidx+1];
 783
 784         /* Get outer coordinate index */
 785         inr              = iinr[iidx];
 786         i_coord_offset   = DIM*inr;
 787
 788         /* Load i particle coords and add shift vector */
 789         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 790                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 791
             /* Per-i-entry force accumulators, one x/y/z triple for each of the three i atoms */
 792         fix0             = _mm_setzero_ps();
 793         fiy0             = _mm_setzero_ps();
 794         fiz0             = _mm_setzero_ps();
 795         fix1             = _mm_setzero_ps();
 796         fiy1             = _mm_setzero_ps();
 797         fiz1             = _mm_setzero_ps();
 798         fix2             = _mm_setzero_ps();
 799         fiy2             = _mm_setzero_ps();
 800         fiz2             = _mm_setzero_ps();
 801
 802         /* Start inner kernel loop */
             /* Handles four j entries per iteration, for as long as the fourth entry of the group
              * is non-negative. A trailing group whose fourth entry is negative (dummy) is left to
              * the masked epilogue after this loop.
              * NOTE(review): jjnr[jidx+3] is read whenever jidx<j_index_end, so the j list is
              * presumably padded to a multiple of four entries - confirm.
              */
 803         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 804         {
 805
 806             /* Get j neighbor index, and coordinate index */
 807             jnrA             = jjnr[jidx];
 808             jnrB             = jjnr[jidx+1];
 809             jnrC             = jjnr[jidx+2];
 810             jnrD             = jjnr[jidx+3];
 811             j_coord_offsetA  = DIM*jnrA;
 812             j_coord_offsetB  = DIM*jnrB;
 813             j_coord_offsetC  = DIM*jnrC;
 814             j_coord_offsetD  = DIM*jnrD;
 815
 816             /* load j atom coordinates */
 817             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 818                                               x+j_coord_offsetC,x+j_coord_offsetD,
 819                                               &jx0,&jy0,&jz0);
 820
 821             /* Calculate displacement vector */
 822             dx00             = _mm_sub_ps(ix0,jx0);
 823             dy00             = _mm_sub_ps(iy0,jy0);
 824             dz00             = _mm_sub_ps(iz0,jz0);
 825             dx10             = _mm_sub_ps(ix1,jx0);
 826             dy10             = _mm_sub_ps(iy1,jy0);
 827             dz10             = _mm_sub_ps(iz1,jz0);
 828             dx20             = _mm_sub_ps(ix2,jx0);
 829             dy20             = _mm_sub_ps(iy2,jy0);
 830             dz20             = _mm_sub_ps(iz2,jz0);
 831
 832             /* Calculate squared distance and things based on it */
 833             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 834             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
 835             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
 836
 837             rinv00           = gmx_mm_invsqrt_ps(rsq00);
 838             rinv10           = gmx_mm_invsqrt_ps(rsq10);
 839             rinv20           = gmx_mm_invsqrt_ps(rsq20);
 840
 841             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 842             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
 843             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
 844
 845             /* Load parameters for j particles */
 846             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
 847                                                               charge+jnrC+0,charge+jnrD+0);
 848             vdwjidx0A        = 2*vdwtype[jnrA+0];
 849             vdwjidx0B        = 2*vdwtype[jnrB+0];
 850             vdwjidx0C        = 2*vdwtype[jnrC+0];
 851             vdwjidx0D        = 2*vdwtype[jnrD+0];
 852
                 /* j-force accumulator, shared by the three i-j pairs handled below */
 853             fjx0             = _mm_setzero_ps();
 854             fjy0             = _mm_setzero_ps();
 855             fjz0             = _mm_setzero_ps();
 856
 857             /**************************
 858              * CALCULATE INTERACTIONS *
 859              **************************/
 860
                 /* Pair 0-0: Ewald electrostatics plus Lennard-Jones. The block is entered based on
                  * gmx_mm_any_lt(rsq00,rcutoff2) - presumably true when any lane is inside the
                  * cutoff - and lanes not satisfying rsq00 < rcutoff2 are zeroed through cutoff_mask.
                  */
 861             if (gmx_mm_any_lt(rsq00,rcutoff2))
 862             {
 863
                 /* NOTE(review): r00 is not read again in this function */
 864             r00              = _mm_mul_ps(rsq00,rinv00);
 865
 866             /* Compute parameters for interactions between i and j atoms */
 867             qq00             = _mm_mul_ps(iq0,jq0);
 868             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
 869                                          vdwparam+vdwioffset0+vdwjidx0B,
 870                                          vdwparam+vdwioffset0+vdwjidx0C,
 871                                          vdwparam+vdwioffset0+vdwjidx0D,
 872                                          &c6_00,&c12_00);
 873
 874             /* EWALD ELECTROSTATICS */
 875
 876             /* Analytical PME correction */
                 /* felec = qq00*(pmecorrF*beta3 + rinv3); _mm_macc_ps is the fused a*b+c form */
 877             zeta2            = _mm_mul_ps(beta2,rsq00);
 878             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
 879             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 880             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 881             felec            = _mm_mul_ps(qq00,felec);
 882
 883             /* LENNARD-JONES DISPERSION/REPULSION */
 884
                 /* fvdw = (c12_00*rinvsix - c6_00)*rinvsix*rinvsq00; _mm_msub_ps is the fused a*b-c form */
 885             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 886             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
 887
 888             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
 889
 890             fscal            = _mm_add_ps(felec,fvdw);
 891
 892             fscal            = _mm_and_ps(fscal,cutoff_mask);
 893
 894              /* Update vectorial force */
 895             fix0             = _mm_macc_ps(dx00,fscal,fix0);
 896             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 897             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 898
                 /* The same d*fscal contribution is summed for the j atom; the total is handed to
                  * gmx_mm_decrement_1rvec_4ptr_swizzle_ps once all three pairs are done.
                  */
 899             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
 900             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
 901             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 902
 903             }
 904
 905             /**************************
 906              * CALCULATE INTERACTIONS *
 907              **************************/
 908
                 /* Pair 1-0: electrostatics only, no LJ term is added to fscal */
 909             if (gmx_mm_any_lt(rsq10,rcutoff2))
 910             {
 911
                 /* NOTE(review): r10 is not read again in this function */
 912             r10              = _mm_mul_ps(rsq10,rinv10);
 913
 914             /* Compute parameters for interactions between i and j atoms */
 915             qq10             = _mm_mul_ps(iq1,jq0);
 916
 917             /* EWALD ELECTROSTATICS */
 918
 919             /* Analytical PME correction */
 920             zeta2            = _mm_mul_ps(beta2,rsq10);
 921             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
 922             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 923             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 924             felec            = _mm_mul_ps(qq10,felec);
 925
 926             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
 927
 928             fscal            = felec;
 929
 930             fscal            = _mm_and_ps(fscal,cutoff_mask);
 931
 932              /* Update vectorial force */
 933             fix1             = _mm_macc_ps(dx10,fscal,fix1);
 934             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
 935             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 936
 937             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
 938             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
 939             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 940
 941             }
 942
 943             /**************************
 944              * CALCULATE INTERACTIONS *
 945              **************************/
 946
                 /* Pair 2-0: electrostatics only, no LJ term is added to fscal */
 947             if (gmx_mm_any_lt(rsq20,rcutoff2))
 948             {
 949
                 /* NOTE(review): r20 is not read again in this function */
 950             r20              = _mm_mul_ps(rsq20,rinv20);
 951
 952             /* Compute parameters for interactions between i and j atoms */
 953             qq20             = _mm_mul_ps(iq2,jq0);
 954
 955             /* EWALD ELECTROSTATICS */
 956
 957             /* Analytical PME correction */
 958             zeta2            = _mm_mul_ps(beta2,rsq20);
 959             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
 960             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
 961             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
 962             felec            = _mm_mul_ps(qq20,felec);
 963
 964             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
 965
 966             fscal            = felec;
 967
 968             fscal            = _mm_and_ps(fscal,cutoff_mask);
 969
 970              /* Update vectorial force */
 971             fix2             = _mm_macc_ps(dx20,fscal,fix2);
 972             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
 973             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 974
 975             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
 976             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
 977             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 978
 979             }
 980
                 /* All four j entries are real in this loop, so the j forces go straight into f */
 981             fjptrA             = f+j_coord_offsetA;
 982             fjptrB             = f+j_coord_offsetB;
 983             fjptrC             = f+j_coord_offsetC;
 984             fjptrD             = f+j_coord_offsetD;
 985
 986             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
 987
 988             /* Inner loop uses 100 flops */
 989         }
 990
             /* Masked epilogue: runs at most once per i entry, for a remaining group of four whose
              * fourth entry is negative. It repeats the loop body above with dummy_mask applied.
              */
 991         if(jidx<j_index_end)
 992         {
 993
 994             /* Get j neighbor index, and coordinate index */
 995             jnrlistA         = jjnr[jidx];
 996             jnrlistB         = jjnr[jidx+1];
 997             jnrlistC         = jjnr[jidx+2];
 998             jnrlistD         = jjnr[jidx+3];
 999             /* Sign of each element will be negative for non-real atoms.
1000              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1001              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1002              */
1003             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
                 /* Negative (dummy) indices are replaced by 0 so that the coordinate and parameter
                  * loads below use index 0 instead of a negative index.
                  */
1004             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1005             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1006             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1007             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1008             j_coord_offsetA  = DIM*jnrA;
1009             j_coord_offsetB  = DIM*jnrB;
1010             j_coord_offsetC  = DIM*jnrC;
1011             j_coord_offsetD  = DIM*jnrD;
1012
1013             /* load j atom coordinates */
1014             gmx_mm_load_1rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1015                                               x+j_coord_offsetC,x+j_coord_offsetD,
1016                                               &jx0,&jy0,&jz0);
1017
1018             /* Calculate displacement vector */
1019             dx00             = _mm_sub_ps(ix0,jx0);
1020             dy00             = _mm_sub_ps(iy0,jy0);
1021             dz00             = _mm_sub_ps(iz0,jz0);
1022             dx10             = _mm_sub_ps(ix1,jx0);
1023             dy10             = _mm_sub_ps(iy1,jy0);
1024             dz10             = _mm_sub_ps(iz1,jz0);
1025             dx20             = _mm_sub_ps(ix2,jx0);
1026             dy20             = _mm_sub_ps(iy2,jy0);
1027             dz20             = _mm_sub_ps(iz2,jz0);
1028
1029             /* Calculate squared distance and things based on it */
1030             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1031             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1032             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1033
1034             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1035             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1036             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1037
1038             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1039             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1040             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1041
1042             /* Load parameters for j particles */
1043             jq0              = gmx_mm_load_4real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1044                                                               charge+jnrC+0,charge+jnrD+0);
1045             vdwjidx0A        = 2*vdwtype[jnrA+0];
1046             vdwjidx0B        = 2*vdwtype[jnrB+0];
1047             vdwjidx0C        = 2*vdwtype[jnrC+0];
1048             vdwjidx0D        = 2*vdwtype[jnrD+0];
1049
1050             fjx0             = _mm_setzero_ps();
1051             fjy0             = _mm_setzero_ps();
1052             fjz0             = _mm_setzero_ps();
1053
1054             /**************************
1055              * CALCULATE INTERACTIONS *
1056              **************************/
1057
                 /* Pair 0-0: same arithmetic as in the unmasked loop, with fscal additionally
                  * cleared for dummy lanes before it is accumulated.
                  */
1058             if (gmx_mm_any_lt(rsq00,rcutoff2))
1059             {
1060
                 /* NOTE(review): r00 is computed and masked but not read again in this function */
1061             r00              = _mm_mul_ps(rsq00,rinv00);
1062             r00              = _mm_andnot_ps(dummy_mask,r00);
1063
1064             /* Compute parameters for interactions between i and j atoms */
1065             qq00             = _mm_mul_ps(iq0,jq0);
1066             gmx_mm_load_4pair_swizzle_ps(vdwparam+vdwioffset0+vdwjidx0A,
1067                                          vdwparam+vdwioffset0+vdwjidx0B,
1068                                          vdwparam+vdwioffset0+vdwjidx0C,
1069                                          vdwparam+vdwioffset0+vdwjidx0D,
1070                                          &c6_00,&c12_00);
1071
1072             /* EWALD ELECTROSTATICS */
1073
1074             /* Analytical PME correction */
1075             zeta2            = _mm_mul_ps(beta2,rsq00);
1076             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1077             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1078             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1079             felec            = _mm_mul_ps(qq00,felec);
1080
1081             /* LENNARD-JONES DISPERSION/REPULSION */
1082
1083             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1084             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1085
1086             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1087
1088             fscal            = _mm_add_ps(felec,fvdw);
1089
1090             fscal            = _mm_and_ps(fscal,cutoff_mask);
1091
                 /* Clear the scalar force of dummy lanes */
1092             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1093
1094              /* Update vectorial force */
1095             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1096             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1097             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1098
1099             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1100             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1101             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1102
1103             }
1104
1105             /**************************
1106              * CALCULATE INTERACTIONS *
1107              **************************/
1108
                 /* Pair 1-0: electrostatics only, masked for dummy lanes */
1109             if (gmx_mm_any_lt(rsq10,rcutoff2))
1110             {
1111
                 /* NOTE(review): r10 is computed and masked but not read again in this function */
1112             r10              = _mm_mul_ps(rsq10,rinv10);
1113             r10              = _mm_andnot_ps(dummy_mask,r10);
1114
1115             /* Compute parameters for interactions between i and j atoms */
1116             qq10             = _mm_mul_ps(iq1,jq0);
1117
1118             /* EWALD ELECTROSTATICS */
1119
1120             /* Analytical PME correction */
1121             zeta2            = _mm_mul_ps(beta2,rsq10);
1122             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1123             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1124             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1125             felec            = _mm_mul_ps(qq10,felec);
1126
1127             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1128
1129             fscal            = felec;
1130
1131             fscal            = _mm_and_ps(fscal,cutoff_mask);
1132
1133             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1134
1135              /* Update vectorial force */
1136             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1137             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1138             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1139
1140             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1141             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1142             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1143
1144             }
1145
1146             /**************************
1147              * CALCULATE INTERACTIONS *
1148              **************************/
1149
                 /* Pair 2-0: electrostatics only, masked for dummy lanes */
1150             if (gmx_mm_any_lt(rsq20,rcutoff2))
1151             {
1152
                 /* NOTE(review): r20 is computed and masked but not read again in this function */
1153             r20              = _mm_mul_ps(rsq20,rinv20);
1154             r20              = _mm_andnot_ps(dummy_mask,r20);
1155
1156             /* Compute parameters for interactions between i and j atoms */
1157             qq20             = _mm_mul_ps(iq2,jq0);
1158
1159             /* EWALD ELECTROSTATICS */
1160
1161             /* Analytical PME correction */
1162             zeta2            = _mm_mul_ps(beta2,rsq20);
1163             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1164             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1165             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1166             felec            = _mm_mul_ps(qq20,felec);
1167
1168             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1169
1170             fscal            = felec;
1171
1172             fscal            = _mm_and_ps(fscal,cutoff_mask);
1173
1174             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1175
1176              /* Update vectorial force */
1177             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1178             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1179             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1180
1181             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1182             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1183             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1184
1185             }
1186
                 /* Real entries update f; dummy entries direct their j-force update at scratch */
1187             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1188             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1189             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1190             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1191
1192             gmx_mm_decrement_1rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1193
1194             /* Inner loop uses 103 flops */
1195         }
1196
1197         /* End of innermost loop */
1198
             /* Hand the accumulated i forces to the update helper, together with the force and
              * shift-force locations of this i entry.
              */
1199         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1200                                               f+i_coord_offset,fshift+i_shift_offset);
1201
1202         /* Increment number of inner iterations */
1203         inneriter                  += j_index_end - j_index_start;
1204
1205         /* Outer loop uses 18 flops */
1206     }
1207
1208     /* Increment number of outer iterations */
1209     outeriter        += nri;
1210
1211     /* Update outer/inner flops */
         /* Every counted inner iteration is charged 103 flops here, the figure quoted for the
          * masked epilogue; the unmasked loop above quotes 100.
          */
1212
1213     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*103);
1214 }