src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwLJ_GeomW3W3_avx_128_fma_single.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*
  36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
  37  */
  38 #include "config.h"
  39
  40 #include <math.h>
  41
  42 #include "../nb_kernel.h"
  43 #include "gromacs/legacyheaders/types/simple.h"
  44 #include "gromacs/math/vec.h"
  45 #include "gromacs/legacyheaders/nrnb.h"
  46
  47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
  48 #include "kernelutil_x86_avx_128_fma_single.h"
  49
  50 /*
  51  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_single
  52  * Electrostatics interaction: Coulomb
  53  * VdW interaction:            LennardJones
  54  * Geometry:                   Water3-Water3
  55  * Calculate force/pot:        PotentialAndForce
  56  */
  57 void
  58 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_128_fma_single
  59                     (t_nblist                    * gmx_restrict       nlist,
  60                      rvec                        * gmx_restrict          xx,
  61                      rvec                        * gmx_restrict          ff,
  62                      t_forcerec                  * gmx_restrict          fr,
  63                      t_mdatoms                   * gmx_restrict     mdatoms,
  64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
  65                      t_nrnb                      * gmx_restrict        nrnb)
  66 {
  67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
  68      * just 0 for non-waters.
  69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, i.e. for the four different
  70      * jnr indices corresponding to data put in the four positions in the SIMD register.
  71      */
  72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
  73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
  74     int              jnrA,jnrB,jnrC,jnrD;
  75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
  76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
  77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
  78     real             rcutoff_scalar;
  79     real             *shiftvec,*fshift,*x,*f;
  80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
  81     real             scratch[4*DIM];
  82     __m128           fscal,rcutoff,rcutoff2,jidxall;
  83     int              vdwioffset0;
  84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
  85     int              vdwioffset1;
  86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
  87     int              vdwioffset2;
  88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
  89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
  90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
  91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
  92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
  93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
  94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
  95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
  96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
  97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
  98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
  99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
 100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
 101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
 102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
 103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
 104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 105     real             *charge;
 106     int              nvdwtype;
 107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 108     int              *vdwtype;
 109     real             *vdwparam;
 110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 112     __m128           dummy_mask,cutoff_mask;
 113     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 114     __m128           one     = _mm_set1_ps(1.0);
 115     __m128           two     = _mm_set1_ps(2.0);
 116     x                = xx[0];
 117     f                = ff[0];
 118
 119     nri              = nlist->nri;
 120     iinr             = nlist->iinr;
 121     jindex           = nlist->jindex;
 122     jjnr             = nlist->jjnr;
 123     shiftidx         = nlist->shift;
 124     gid              = nlist->gid;
 125     shiftvec         = fr->shift_vec[0];
 126     fshift           = fr->fshift[0];
 127     facel            = _mm_set1_ps(fr->epsfac);
 128     charge           = mdatoms->chargeA;
 129     nvdwtype         = fr->ntype;
 130     vdwparam         = fr->nbfp;
 131     vdwtype          = mdatoms->typeA;
 132
 133     /* Setup water-specific parameters */
 134     inr              = nlist->iinr[0];
 135     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
 136     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
 137     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
 138     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 139
 140     jq0              = _mm_set1_ps(charge[inr+0]);
 141     jq1              = _mm_set1_ps(charge[inr+1]);
 142     jq2              = _mm_set1_ps(charge[inr+2]);
 143     vdwjidx0A        = 2*vdwtype[inr+0];
 144     qq00             = _mm_mul_ps(iq0,jq0);
 145     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
 146     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
 147     qq01             = _mm_mul_ps(iq0,jq1);
 148     qq02             = _mm_mul_ps(iq0,jq2);
 149     qq10             = _mm_mul_ps(iq1,jq0);
 150     qq11             = _mm_mul_ps(iq1,jq1);
 151     qq12             = _mm_mul_ps(iq1,jq2);
 152     qq20             = _mm_mul_ps(iq2,jq0);
 153     qq21             = _mm_mul_ps(iq2,jq1);
 154     qq22             = _mm_mul_ps(iq2,jq2);
 155
 156     /* Avoid stupid compiler warnings */
 157     jnrA = jnrB = jnrC = jnrD = 0;
 158     j_coord_offsetA = 0;
 159     j_coord_offsetB = 0;
 160     j_coord_offsetC = 0;
 161     j_coord_offsetD = 0;
 162
 163     outeriter        = 0;
 164     inneriter        = 0;
 165
 166     for(iidx=0;iidx<4*DIM;iidx++)
 167     {
 168         scratch[iidx] = 0.0;
 169     }
 170
 171     /* Start outer loop over neighborlists */
 172     for(iidx=0; iidx<nri; iidx++)
 173     {
 174         /* Load shift vector for this list */
 175         i_shift_offset   = DIM*shiftidx[iidx];
 176
 177         /* Load limits for loop over neighbors */
 178         j_index_start    = jindex[iidx];
 179         j_index_end      = jindex[iidx+1];
 180
 181         /* Get outer coordinate index */
 182         inr              = iinr[iidx];
 183         i_coord_offset   = DIM*inr;
 184
 185         /* Load i particle coords and add shift vector */
 186         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 187                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 188
 189         fix0             = _mm_setzero_ps();
 190         fiy0             = _mm_setzero_ps();
 191         fiz0             = _mm_setzero_ps();
 192         fix1             = _mm_setzero_ps();
 193         fiy1             = _mm_setzero_ps();
 194         fiz1             = _mm_setzero_ps();
 195         fix2             = _mm_setzero_ps();
 196         fiy2             = _mm_setzero_ps();
 197         fiz2             = _mm_setzero_ps();
 198
 199         /* Reset potential sums */
 200         velecsum         = _mm_setzero_ps();
 201         vvdwsum          = _mm_setzero_ps();
 202
 203         /* Start inner kernel loop */
 204         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 205         {
 206
 207             /* Get j neighbor index, and coordinate index */
 208             jnrA             = jjnr[jidx];
 209             jnrB             = jjnr[jidx+1];
 210             jnrC             = jjnr[jidx+2];
 211             jnrD             = jjnr[jidx+3];
 212             j_coord_offsetA  = DIM*jnrA;
 213             j_coord_offsetB  = DIM*jnrB;
 214             j_coord_offsetC  = DIM*jnrC;
 215             j_coord_offsetD  = DIM*jnrD;
 216
 217             /* load j atom coordinates */
 218             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 219                                               x+j_coord_offsetC,x+j_coord_offsetD,
 220                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 221
 222             /* Calculate displacement vector */
 223             dx00             = _mm_sub_ps(ix0,jx0);
 224             dy00             = _mm_sub_ps(iy0,jy0);
 225             dz00             = _mm_sub_ps(iz0,jz0);
 226             dx01             = _mm_sub_ps(ix0,jx1);
 227             dy01             = _mm_sub_ps(iy0,jy1);
 228             dz01             = _mm_sub_ps(iz0,jz1);
 229             dx02             = _mm_sub_ps(ix0,jx2);
 230             dy02             = _mm_sub_ps(iy0,jy2);
 231             dz02             = _mm_sub_ps(iz0,jz2);
 232             dx10             = _mm_sub_ps(ix1,jx0);
 233             dy10             = _mm_sub_ps(iy1,jy0);
 234             dz10             = _mm_sub_ps(iz1,jz0);
 235             dx11             = _mm_sub_ps(ix1,jx1);
 236             dy11             = _mm_sub_ps(iy1,jy1);
 237             dz11             = _mm_sub_ps(iz1,jz1);
 238             dx12             = _mm_sub_ps(ix1,jx2);
 239             dy12             = _mm_sub_ps(iy1,jy2);
 240             dz12             = _mm_sub_ps(iz1,jz2);
 241             dx20             = _mm_sub_ps(ix2,jx0);
 242             dy20             = _mm_sub_ps(iy2,jy0);
 243             dz20             = _mm_sub_ps(iz2,jz0);
 244             dx21             = _mm_sub_ps(ix2,jx1);
 245             dy21             = _mm_sub_ps(iy2,jy1);
 246             dz21             = _mm_sub_ps(iz2,jz1);
 247             dx22             = _mm_sub_ps(ix2,jx2);
 248             dy22             = _mm_sub_ps(iy2,jy2);
 249             dz22             = _mm_sub_ps(iz2,jz2);
 250
 251             /* Calculate squared distance and things based on it */
 252             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 253             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
 254             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
 255             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
 256             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
 257             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
 258             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
 259             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
 260             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
 261
 262             rinv00           = gmx_mm_invsqrt_ps(rsq00);
 263             rinv01           = gmx_mm_invsqrt_ps(rsq01);
 264             rinv02           = gmx_mm_invsqrt_ps(rsq02);
 265             rinv10           = gmx_mm_invsqrt_ps(rsq10);
 266             rinv11           = gmx_mm_invsqrt_ps(rsq11);
 267             rinv12           = gmx_mm_invsqrt_ps(rsq12);
 268             rinv20           = gmx_mm_invsqrt_ps(rsq20);
 269             rinv21           = gmx_mm_invsqrt_ps(rsq21);
 270             rinv22           = gmx_mm_invsqrt_ps(rsq22);
 271
 272             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 273             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
 274             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
 275             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
 276             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
 277             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
 278             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
 279             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
 280             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
 281
 282             fjx0             = _mm_setzero_ps();
 283             fjy0             = _mm_setzero_ps();
 284             fjz0             = _mm_setzero_ps();
 285             fjx1             = _mm_setzero_ps();
 286             fjy1             = _mm_setzero_ps();
 287             fjz1             = _mm_setzero_ps();
 288             fjx2             = _mm_setzero_ps();
 289             fjy2             = _mm_setzero_ps();
 290             fjz2             = _mm_setzero_ps();
 291
 292             /**************************
 293              * CALCULATE INTERACTIONS *
 294              **************************/
 295
 296             /* COULOMB ELECTROSTATICS */
 297             velec            = _mm_mul_ps(qq00,rinv00);
 298             felec            = _mm_mul_ps(velec,rinvsq00);
 299
 300             /* LENNARD-JONES DISPERSION/REPULSION */
 301
 302             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 303             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 304             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 305             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
 306             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
 307
 308             /* Update potential sum for this i atom from the interaction with this j atom. */
 309             velecsum         = _mm_add_ps(velecsum,velec);
 310             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 311
 312             fscal            = _mm_add_ps(felec,fvdw);
 313
 314              /* Update vectorial force */
 315             fix0             = _mm_macc_ps(dx00,fscal,fix0);
 316             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 317             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 318
 319             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
 320             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
 321             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 322
 323             /**************************
 324              * CALCULATE INTERACTIONS *
 325              **************************/
 326
 327             /* COULOMB ELECTROSTATICS */
 328             velec            = _mm_mul_ps(qq01,rinv01);
 329             felec            = _mm_mul_ps(velec,rinvsq01);
 330
 331             /* Update potential sum for this i atom from the interaction with this j atom. */
 332             velecsum         = _mm_add_ps(velecsum,velec);
 333
 334             fscal            = felec;
 335
 336              /* Update vectorial force */
 337             fix0             = _mm_macc_ps(dx01,fscal,fix0);
 338             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
 339             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
 340
 341             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
 342             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
 343             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
 344
 345             /**************************
 346              * CALCULATE INTERACTIONS *
 347              **************************/
 348
 349             /* COULOMB ELECTROSTATICS */
 350             velec            = _mm_mul_ps(qq02,rinv02);
 351             felec            = _mm_mul_ps(velec,rinvsq02);
 352
 353             /* Update potential sum for this i atom from the interaction with this j atom. */
 354             velecsum         = _mm_add_ps(velecsum,velec);
 355
 356             fscal            = felec;
 357
 358              /* Update vectorial force */
 359             fix0             = _mm_macc_ps(dx02,fscal,fix0);
 360             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
 361             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
 362
 363             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
 364             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
 365             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
 366
 367             /**************************
 368              * CALCULATE INTERACTIONS *
 369              **************************/
 370
 371             /* COULOMB ELECTROSTATICS */
 372             velec            = _mm_mul_ps(qq10,rinv10);
 373             felec            = _mm_mul_ps(velec,rinvsq10);
 374
 375             /* Update potential sum for this i atom from the interaction with this j atom. */
 376             velecsum         = _mm_add_ps(velecsum,velec);
 377
 378             fscal            = felec;
 379
 380              /* Update vectorial force */
 381             fix1             = _mm_macc_ps(dx10,fscal,fix1);
 382             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
 383             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 384
 385             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
 386             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
 387             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 388
 389             /**************************
 390              * CALCULATE INTERACTIONS *
 391              **************************/
 392
 393             /* COULOMB ELECTROSTATICS */
 394             velec            = _mm_mul_ps(qq11,rinv11);
 395             felec            = _mm_mul_ps(velec,rinvsq11);
 396
 397             /* Update potential sum for this i atom from the interaction with this j atom. */
 398             velecsum         = _mm_add_ps(velecsum,velec);
 399
 400             fscal            = felec;
 401
 402              /* Update vectorial force */
 403             fix1             = _mm_macc_ps(dx11,fscal,fix1);
 404             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
 405             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
 406
 407             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
 408             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
 409             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
 410
 411             /**************************
 412              * CALCULATE INTERACTIONS *
 413              **************************/
 414
 415             /* COULOMB ELECTROSTATICS */
 416             velec            = _mm_mul_ps(qq12,rinv12);
 417             felec            = _mm_mul_ps(velec,rinvsq12);
 418
 419             /* Update potential sum for this i atom from the interaction with this j atom. */
 420             velecsum         = _mm_add_ps(velecsum,velec);
 421
 422             fscal            = felec;
 423
 424              /* Update vectorial force */
 425             fix1             = _mm_macc_ps(dx12,fscal,fix1);
 426             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
 427             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
 428
 429             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
 430             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
 431             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
 432
 433             /**************************
 434              * CALCULATE INTERACTIONS *
 435              **************************/
 436
 437             /* COULOMB ELECTROSTATICS */
 438             velec            = _mm_mul_ps(qq20,rinv20);
 439             felec            = _mm_mul_ps(velec,rinvsq20);
 440
 441             /* Update potential sum for this i atom from the interaction with this j atom. */
 442             velecsum         = _mm_add_ps(velecsum,velec);
 443
 444             fscal            = felec;
 445
 446              /* Update vectorial force */
 447             fix2             = _mm_macc_ps(dx20,fscal,fix2);
 448             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
 449             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 450
 451             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
 452             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
 453             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 454
 455             /**************************
 456              * CALCULATE INTERACTIONS *
 457              **************************/
 458
 459             /* COULOMB ELECTROSTATICS */
 460             velec            = _mm_mul_ps(qq21,rinv21);
 461             felec            = _mm_mul_ps(velec,rinvsq21);
 462
 463             /* Update potential sum for this i atom from the interaction with this j atom. */
 464             velecsum         = _mm_add_ps(velecsum,velec);
 465
 466             fscal            = felec;
 467
 468              /* Update vectorial force */
 469             fix2             = _mm_macc_ps(dx21,fscal,fix2);
 470             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
 471             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
 472
 473             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
 474             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
 475             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
 476
 477             /**************************
 478              * CALCULATE INTERACTIONS *
 479              **************************/
 480
 481             /* COULOMB ELECTROSTATICS */
 482             velec            = _mm_mul_ps(qq22,rinv22);
 483             felec            = _mm_mul_ps(velec,rinvsq22);
 484
 485             /* Update potential sum for this i atom from the interaction with this j atom. */
 486             velecsum         = _mm_add_ps(velecsum,velec);
 487
 488             fscal            = felec;
 489
 490              /* Update vectorial force */
 491             fix2             = _mm_macc_ps(dx22,fscal,fix2);
 492             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
 493             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
 494
 495             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
 496             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
 497             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
 498
 499             fjptrA             = f+j_coord_offsetA;
 500             fjptrB             = f+j_coord_offsetB;
 501             fjptrC             = f+j_coord_offsetC;
 502             fjptrD             = f+j_coord_offsetD;
 503
 504             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 505                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 506
 507             /* Inner loop uses 291 flops */
 508         }
 509
 510         if(jidx<j_index_end)
 511         {
 512
 513             /* Get j neighbor index, and coordinate index */
 514             jnrlistA         = jjnr[jidx];
 515             jnrlistB         = jjnr[jidx+1];
 516             jnrlistC         = jjnr[jidx+2];
 517             jnrlistD         = jjnr[jidx+3];
 518             /* Sign of each element will be negative for non-real atoms.
 519              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 520              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 521              */
 522             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 523             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 524             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 525             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 526             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 527             j_coord_offsetA  = DIM*jnrA;
 528             j_coord_offsetB  = DIM*jnrB;
 529             j_coord_offsetC  = DIM*jnrC;
 530             j_coord_offsetD  = DIM*jnrD;
 531
 532             /* load j atom coordinates */
 533             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 534                                               x+j_coord_offsetC,x+j_coord_offsetD,
 535                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 536
 537             /* Calculate displacement vector */
 538             dx00             = _mm_sub_ps(ix0,jx0);
 539             dy00             = _mm_sub_ps(iy0,jy0);
 540             dz00             = _mm_sub_ps(iz0,jz0);
 541             dx01             = _mm_sub_ps(ix0,jx1);
 542             dy01             = _mm_sub_ps(iy0,jy1);
 543             dz01             = _mm_sub_ps(iz0,jz1);
 544             dx02             = _mm_sub_ps(ix0,jx2);
 545             dy02             = _mm_sub_ps(iy0,jy2);
 546             dz02             = _mm_sub_ps(iz0,jz2);
 547             dx10             = _mm_sub_ps(ix1,jx0);
 548             dy10             = _mm_sub_ps(iy1,jy0);
 549             dz10             = _mm_sub_ps(iz1,jz0);
 550             dx11             = _mm_sub_ps(ix1,jx1);
 551             dy11             = _mm_sub_ps(iy1,jy1);
 552             dz11             = _mm_sub_ps(iz1,jz1);
 553             dx12             = _mm_sub_ps(ix1,jx2);
 554             dy12             = _mm_sub_ps(iy1,jy2);
 555             dz12             = _mm_sub_ps(iz1,jz2);
 556             dx20             = _mm_sub_ps(ix2,jx0);
 557             dy20             = _mm_sub_ps(iy2,jy0);
 558             dz20             = _mm_sub_ps(iz2,jz0);
 559             dx21             = _mm_sub_ps(ix2,jx1);
 560             dy21             = _mm_sub_ps(iy2,jy1);
 561             dz21             = _mm_sub_ps(iz2,jz1);
 562             dx22             = _mm_sub_ps(ix2,jx2);
 563             dy22             = _mm_sub_ps(iy2,jy2);
 564             dz22             = _mm_sub_ps(iz2,jz2);
 565
 566             /* Calculate squared distance and things based on it */
 567             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 568             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
 569             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
 570             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
 571             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
 572             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
 573             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
 574             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
 575             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
 576
 577             rinv00           = gmx_mm_invsqrt_ps(rsq00);
 578             rinv01           = gmx_mm_invsqrt_ps(rsq01);
 579             rinv02           = gmx_mm_invsqrt_ps(rsq02);
 580             rinv10           = gmx_mm_invsqrt_ps(rsq10);
 581             rinv11           = gmx_mm_invsqrt_ps(rsq11);
 582             rinv12           = gmx_mm_invsqrt_ps(rsq12);
 583             rinv20           = gmx_mm_invsqrt_ps(rsq20);
 584             rinv21           = gmx_mm_invsqrt_ps(rsq21);
 585             rinv22           = gmx_mm_invsqrt_ps(rsq22);
 586
 587             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 588             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
 589             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
 590             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
 591             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
 592             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
 593             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
 594             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
 595             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
 596
 597             fjx0             = _mm_setzero_ps();
 598             fjy0             = _mm_setzero_ps();
 599             fjz0             = _mm_setzero_ps();
 600             fjx1             = _mm_setzero_ps();
 601             fjy1             = _mm_setzero_ps();
 602             fjz1             = _mm_setzero_ps();
 603             fjx2             = _mm_setzero_ps();
 604             fjy2             = _mm_setzero_ps();
 605             fjz2             = _mm_setzero_ps();
 606
 607             /**************************
 608              * CALCULATE INTERACTIONS *
 609              **************************/
 610
 611             /* COULOMB ELECTROSTATICS */
 612             velec            = _mm_mul_ps(qq00,rinv00);
 613             felec            = _mm_mul_ps(velec,rinvsq00);
 614
 615             /* LENNARD-JONES DISPERSION/REPULSION */
 616
 617             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
 618             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
 619             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
 620             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
 621             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
 622
 623             /* Update potential sum for this i atom from the interaction with this j atom. */
 624             velec            = _mm_andnot_ps(dummy_mask,velec);
 625             velecsum         = _mm_add_ps(velecsum,velec);
 626             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 627             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 628
 629             fscal            = _mm_add_ps(felec,fvdw);
 630
 631             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 632
 633              /* Update vectorial force */
 634             fix0             = _mm_macc_ps(dx00,fscal,fix0);
 635             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 636             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 637
 638             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
 639             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
 640             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 641
 642             /**************************
 643              * CALCULATE INTERACTIONS *
 644              **************************/
 645
 646             /* COULOMB ELECTROSTATICS */
 647             velec            = _mm_mul_ps(qq01,rinv01);
 648             felec            = _mm_mul_ps(velec,rinvsq01);
 649
 650             /* Update potential sum for this i atom from the interaction with this j atom. */
 651             velec            = _mm_andnot_ps(dummy_mask,velec);
 652             velecsum         = _mm_add_ps(velecsum,velec);
 653
 654             fscal            = felec;
 655
 656             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 657
 658              /* Update vectorial force */
 659             fix0             = _mm_macc_ps(dx01,fscal,fix0);
 660             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
 661             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
 662
 663             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
 664             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
 665             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
 666
 667             /**************************
 668              * CALCULATE INTERACTIONS *
 669              **************************/
 670
 671             /* COULOMB ELECTROSTATICS */
 672             velec            = _mm_mul_ps(qq02,rinv02);
 673             felec            = _mm_mul_ps(velec,rinvsq02);
 674
 675             /* Update potential sum for this i atom from the interaction with this j atom. */
 676             velec            = _mm_andnot_ps(dummy_mask,velec);
 677             velecsum         = _mm_add_ps(velecsum,velec);
 678
 679             fscal            = felec;
 680
 681             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 682
 683              /* Update vectorial force */
 684             fix0             = _mm_macc_ps(dx02,fscal,fix0);
 685             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
 686             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
 687
 688             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
 689             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
 690             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
 691
 692             /**************************
 693              * CALCULATE INTERACTIONS *
 694              **************************/
 695
 696             /* COULOMB ELECTROSTATICS */
 697             velec            = _mm_mul_ps(qq10,rinv10);
 698             felec            = _mm_mul_ps(velec,rinvsq10);
 699
 700             /* Update potential sum for this i atom from the interaction with this j atom. */
 701             velec            = _mm_andnot_ps(dummy_mask,velec);
 702             velecsum         = _mm_add_ps(velecsum,velec);
 703
 704             fscal            = felec;
 705
 706             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 707
 708              /* Update vectorial force */
 709             fix1             = _mm_macc_ps(dx10,fscal,fix1);
 710             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
 711             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 712
 713             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
 714             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
 715             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 716
 717             /**************************
 718              * CALCULATE INTERACTIONS *
 719              **************************/
 720
 721             /* COULOMB ELECTROSTATICS */
 722             velec            = _mm_mul_ps(qq11,rinv11);
 723             felec            = _mm_mul_ps(velec,rinvsq11);
 724
 725             /* Update potential sum for this i atom from the interaction with this j atom. */
 726             velec            = _mm_andnot_ps(dummy_mask,velec);
 727             velecsum         = _mm_add_ps(velecsum,velec);
 728
 729             fscal            = felec;
 730
 731             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 732
 733              /* Update vectorial force */
 734             fix1             = _mm_macc_ps(dx11,fscal,fix1);
 735             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
 736             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
 737
 738             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
 739             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
 740             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
 741
 742             /**************************
 743              * CALCULATE INTERACTIONS *
 744              **************************/
 745
 746             /* COULOMB ELECTROSTATICS */
 747             velec            = _mm_mul_ps(qq12,rinv12);
 748             felec            = _mm_mul_ps(velec,rinvsq12);
 749
 750             /* Update potential sum for this i atom from the interaction with this j atom. */
 751             velec            = _mm_andnot_ps(dummy_mask,velec);
 752             velecsum         = _mm_add_ps(velecsum,velec);
 753
 754             fscal            = felec;
 755
 756             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 757
 758              /* Update vectorial force */
 759             fix1             = _mm_macc_ps(dx12,fscal,fix1);
 760             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
 761             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
 762
 763             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
 764             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
 765             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
 766
 767             /**************************
 768              * CALCULATE INTERACTIONS *
 769              **************************/
 770
 771             /* COULOMB ELECTROSTATICS */
 772             velec            = _mm_mul_ps(qq20,rinv20);
 773             felec            = _mm_mul_ps(velec,rinvsq20);
 774
 775             /* Update potential sum for this i atom from the interaction with this j atom. */
 776             velec            = _mm_andnot_ps(dummy_mask,velec);
 777             velecsum         = _mm_add_ps(velecsum,velec);
 778
 779             fscal            = felec;
 780
 781             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 782
 783              /* Update vectorial force */
 784             fix2             = _mm_macc_ps(dx20,fscal,fix2);
 785             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
 786             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 787
 788             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
 789             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
 790             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 791
 792             /**************************
 793              * CALCULATE INTERACTIONS *
 794              **************************/
 795
 796             /* COULOMB ELECTROSTATICS */
 797             velec            = _mm_mul_ps(qq21,rinv21);
 798             felec            = _mm_mul_ps(velec,rinvsq21);
 799
 800             /* Update potential sum for this i atom from the interaction with this j atom. */
 801             velec            = _mm_andnot_ps(dummy_mask,velec);
 802             velecsum         = _mm_add_ps(velecsum,velec);
 803
 804             fscal            = felec;
 805
 806             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 807
 808              /* Update vectorial force */
 809             fix2             = _mm_macc_ps(dx21,fscal,fix2);
 810             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
 811             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
 812
 813             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
 814             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
 815             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
 816
 817             /**************************
 818              * CALCULATE INTERACTIONS *
 819              **************************/
 820
 821             /* COULOMB ELECTROSTATICS */
 822             velec            = _mm_mul_ps(qq22,rinv22);
 823             felec            = _mm_mul_ps(velec,rinvsq22);
 824
 825             /* Update potential sum for this i atom from the interaction with this j atom. */
 826             velec            = _mm_andnot_ps(dummy_mask,velec);
 827             velecsum         = _mm_add_ps(velecsum,velec);
 828
 829             fscal            = felec;
 830
 831             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 832
 833              /* Update vectorial force */
 834             fix2             = _mm_macc_ps(dx22,fscal,fix2);
 835             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
 836             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
 837
 838             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
 839             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
 840             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
 841
 842             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 843             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 844             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 845             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 846
 847             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 848                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 849
 850             /* Inner loop uses 291 flops */
 851         }
 852
 853         /* End of innermost loop */
 854
 855         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 856                                               f+i_coord_offset,fshift+i_shift_offset);
 857
 858         ggid                        = gid[iidx];
 859         /* Update potential energies */
 860         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 861         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 862
 863         /* Increment number of inner iterations */
 864         inneriter                  += j_index_end - j_index_start;
 865
 866         /* Outer loop uses 20 flops */
 867     }
 868
 869     /* Increment number of outer iterations */
 870     outeriter        += nri;
 871
 872     /* Update outer/inner flops */
 873
 874     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*291);
 875 }
 876 /*
 877  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_single
 878  * Electrostatics interaction: Coulomb
 879  * VdW interaction:            LennardJones
 880  * Geometry:                   Water3-Water3
 881  * Calculate force/pot:        Force
 882  */
 883 void
 884 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_128_fma_single
 885                     (t_nblist                    * gmx_restrict       nlist,
 886                      rvec                        * gmx_restrict          xx,
 887                      rvec                        * gmx_restrict          ff,
 888                      t_forcerec                  * gmx_restrict          fr,
 889                      t_mdatoms                   * gmx_restrict     mdatoms,
 890                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
 891                      t_nrnb                      * gmx_restrict        nrnb)
 892 {
 893     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 894      * just 0 for non-waters.
 895      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 896      * jnr indices corresponding to data put in the four positions in the SIMD register.
 897      */
 898     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 899     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 900     int              jnrA,jnrB,jnrC,jnrD;
 901     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 902     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 903     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 904     real             rcutoff_scalar;
 905     real             *shiftvec,*fshift,*x,*f;
 906     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 907     real             scratch[4*DIM];
 908     __m128           fscal,rcutoff,rcutoff2,jidxall;
 909     int              vdwioffset0;
 910     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 911     int              vdwioffset1;
 912     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
 913     int              vdwioffset2;
 914     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
 915     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 916     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 917     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
 918     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
 919     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
 920     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
 921     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 922     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
 923     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
 924     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
 925     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
 926     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
 927     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
 928     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
 929     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
 930     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 931     real             *charge;
 932     int              nvdwtype;
 933     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 934     int              *vdwtype;
 935     real             *vdwparam;
 936     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 937     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 938     __m128           dummy_mask,cutoff_mask;
 939     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 940     __m128           one     = _mm_set1_ps(1.0);
 941     __m128           two     = _mm_set1_ps(2.0);
 942     x                = xx[0];
 943     f                = ff[0];
 944
 945     nri              = nlist->nri;
 946     iinr             = nlist->iinr;
 947     jindex           = nlist->jindex;
 948     jjnr             = nlist->jjnr;
 949     shiftidx         = nlist->shift;
 950     gid              = nlist->gid;
 951     shiftvec         = fr->shift_vec[0];
 952     fshift           = fr->fshift[0];
 953     facel            = _mm_set1_ps(fr->epsfac);
 954     charge           = mdatoms->chargeA;
 955     nvdwtype         = fr->ntype;
 956     vdwparam         = fr->nbfp;
 957     vdwtype          = mdatoms->typeA;
 958
 959     /* Setup water-specific parameters */
 960     inr              = nlist->iinr[0];
 961     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
 962     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
 963     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
 964     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 965
 966     jq0              = _mm_set1_ps(charge[inr+0]);
 967     jq1              = _mm_set1_ps(charge[inr+1]);
 968     jq2              = _mm_set1_ps(charge[inr+2]);
 969     vdwjidx0A        = 2*vdwtype[inr+0];
 970     qq00             = _mm_mul_ps(iq0,jq0);
 971     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
 972     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
 973     qq01             = _mm_mul_ps(iq0,jq1);
 974     qq02             = _mm_mul_ps(iq0,jq2);
 975     qq10             = _mm_mul_ps(iq1,jq0);
 976     qq11             = _mm_mul_ps(iq1,jq1);
 977     qq12             = _mm_mul_ps(iq1,jq2);
 978     qq20             = _mm_mul_ps(iq2,jq0);
 979     qq21             = _mm_mul_ps(iq2,jq1);
 980     qq22             = _mm_mul_ps(iq2,jq2);
 981
 982     /* Avoid stupid compiler warnings */
 983     jnrA = jnrB = jnrC = jnrD = 0;
 984     j_coord_offsetA = 0;
 985     j_coord_offsetB = 0;
 986     j_coord_offsetC = 0;
 987     j_coord_offsetD = 0;
 988
 989     outeriter        = 0;
 990     inneriter        = 0;
 991
 992     for(iidx=0;iidx<4*DIM;iidx++)
 993     {
 994         scratch[iidx] = 0.0;
 995     }
 996
 997     /* Start outer loop over neighborlists */
 998     for(iidx=0; iidx<nri; iidx++)
 999     {
1000         /* Load shift vector for this list */
1001         i_shift_offset   = DIM*shiftidx[iidx];
1002
1003         /* Load limits for loop over neighbors */
1004         j_index_start    = jindex[iidx];
1005         j_index_end      = jindex[iidx+1];
1006
1007         /* Get outer coordinate index */
1008         inr              = iinr[iidx];
1009         i_coord_offset   = DIM*inr;
1010
1011         /* Load i particle coords and add shift vector */
1012         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1013                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1014
1015         fix0             = _mm_setzero_ps();
1016         fiy0             = _mm_setzero_ps();
1017         fiz0             = _mm_setzero_ps();
1018         fix1             = _mm_setzero_ps();
1019         fiy1             = _mm_setzero_ps();
1020         fiz1             = _mm_setzero_ps();
1021         fix2             = _mm_setzero_ps();
1022         fiy2             = _mm_setzero_ps();
1023         fiz2             = _mm_setzero_ps();
1024
1025         /* Start inner kernel loop */
1026         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1027         {
1028
1029             /* Get j neighbor index, and coordinate index */
1030             jnrA             = jjnr[jidx];
1031             jnrB             = jjnr[jidx+1];
1032             jnrC             = jjnr[jidx+2];
1033             jnrD             = jjnr[jidx+3];
1034             j_coord_offsetA  = DIM*jnrA;
1035             j_coord_offsetB  = DIM*jnrB;
1036             j_coord_offsetC  = DIM*jnrC;
1037             j_coord_offsetD  = DIM*jnrD;
1038
1039             /* load j atom coordinates */
1040             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1041                                               x+j_coord_offsetC,x+j_coord_offsetD,
1042                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1043
1044             /* Calculate displacement vector */
1045             dx00             = _mm_sub_ps(ix0,jx0);
1046             dy00             = _mm_sub_ps(iy0,jy0);
1047             dz00             = _mm_sub_ps(iz0,jz0);
1048             dx01             = _mm_sub_ps(ix0,jx1);
1049             dy01             = _mm_sub_ps(iy0,jy1);
1050             dz01             = _mm_sub_ps(iz0,jz1);
1051             dx02             = _mm_sub_ps(ix0,jx2);
1052             dy02             = _mm_sub_ps(iy0,jy2);
1053             dz02             = _mm_sub_ps(iz0,jz2);
1054             dx10             = _mm_sub_ps(ix1,jx0);
1055             dy10             = _mm_sub_ps(iy1,jy0);
1056             dz10             = _mm_sub_ps(iz1,jz0);
1057             dx11             = _mm_sub_ps(ix1,jx1);
1058             dy11             = _mm_sub_ps(iy1,jy1);
1059             dz11             = _mm_sub_ps(iz1,jz1);
1060             dx12             = _mm_sub_ps(ix1,jx2);
1061             dy12             = _mm_sub_ps(iy1,jy2);
1062             dz12             = _mm_sub_ps(iz1,jz2);
1063             dx20             = _mm_sub_ps(ix2,jx0);
1064             dy20             = _mm_sub_ps(iy2,jy0);
1065             dz20             = _mm_sub_ps(iz2,jz0);
1066             dx21             = _mm_sub_ps(ix2,jx1);
1067             dy21             = _mm_sub_ps(iy2,jy1);
1068             dz21             = _mm_sub_ps(iz2,jz1);
1069             dx22             = _mm_sub_ps(ix2,jx2);
1070             dy22             = _mm_sub_ps(iy2,jy2);
1071             dz22             = _mm_sub_ps(iz2,jz2);
1072
1073             /* Calculate squared distance and things based on it */
1074             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1075             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1076             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1077             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1078             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1079             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1080             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1081             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1082             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1083
1084             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1085             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1086             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1087             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1088             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1089             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1090             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1091             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1092             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1093
1094             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1095             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1096             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1097             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1098             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1099             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1100             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1101             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1102             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1103
1104             fjx0             = _mm_setzero_ps();
1105             fjy0             = _mm_setzero_ps();
1106             fjz0             = _mm_setzero_ps();
1107             fjx1             = _mm_setzero_ps();
1108             fjy1             = _mm_setzero_ps();
1109             fjz1             = _mm_setzero_ps();
1110             fjx2             = _mm_setzero_ps();
1111             fjy2             = _mm_setzero_ps();
1112             fjz2             = _mm_setzero_ps();
1113
1114             /**************************
1115              * CALCULATE INTERACTIONS *
1116              **************************/
1117
1118             /* COULOMB ELECTROSTATICS */
1119             velec            = _mm_mul_ps(qq00,rinv00);
1120             felec            = _mm_mul_ps(velec,rinvsq00);
1121
1122             /* LENNARD-JONES DISPERSION/REPULSION */
1123
1124             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1125             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1126
1127             fscal            = _mm_add_ps(felec,fvdw);
1128
1129              /* Update vectorial force */
1130             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1131             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1132             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1133
1134             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1135             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1136             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1137
1138             /**************************
1139              * CALCULATE INTERACTIONS *
1140              **************************/
1141
1142             /* COULOMB ELECTROSTATICS */
1143             velec            = _mm_mul_ps(qq01,rinv01);
1144             felec            = _mm_mul_ps(velec,rinvsq01);
1145
1146             fscal            = felec;
1147
1148              /* Update vectorial force */
1149             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1150             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1151             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1152
1153             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1154             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1155             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1156
1157             /**************************
1158              * CALCULATE INTERACTIONS *
1159              **************************/
1160
1161             /* COULOMB ELECTROSTATICS */
1162             velec            = _mm_mul_ps(qq02,rinv02);
1163             felec            = _mm_mul_ps(velec,rinvsq02);
1164
1165             fscal            = felec;
1166
1167              /* Update vectorial force */
1168             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1169             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1170             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1171
1172             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1173             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1174             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1175
1176             /**************************
1177              * CALCULATE INTERACTIONS *
1178              **************************/
1179
1180             /* COULOMB ELECTROSTATICS */
1181             velec            = _mm_mul_ps(qq10,rinv10);
1182             felec            = _mm_mul_ps(velec,rinvsq10);
1183
1184             fscal            = felec;
1185
1186              /* Update vectorial force */
1187             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1188             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1189             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1190
1191             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1192             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1193             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1194
1195             /**************************
1196              * CALCULATE INTERACTIONS *
1197              **************************/
1198
1199             /* COULOMB ELECTROSTATICS */
1200             velec            = _mm_mul_ps(qq11,rinv11);
1201             felec            = _mm_mul_ps(velec,rinvsq11);
1202
1203             fscal            = felec;
1204
1205              /* Update vectorial force */
1206             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1207             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1208             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1209
1210             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1211             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1212             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1213
1214             /**************************
1215              * CALCULATE INTERACTIONS *
1216              **************************/
1217
1218             /* COULOMB ELECTROSTATICS */
1219             velec            = _mm_mul_ps(qq12,rinv12);
1220             felec            = _mm_mul_ps(velec,rinvsq12);
1221
1222             fscal            = felec;
1223
1224              /* Update vectorial force */
1225             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1226             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1227             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1228
1229             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1230             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1231             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1232
1233             /**************************
1234              * CALCULATE INTERACTIONS *
1235              **************************/
1236
1237             /* COULOMB ELECTROSTATICS */
1238             velec            = _mm_mul_ps(qq20,rinv20);
1239             felec            = _mm_mul_ps(velec,rinvsq20);
1240
1241             fscal            = felec;
1242
1243              /* Update vectorial force */
1244             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1245             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1246             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1247
1248             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1249             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1250             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1251
1252             /**************************
1253              * CALCULATE INTERACTIONS *
1254              **************************/
1255
1256             /* COULOMB ELECTROSTATICS */
1257             velec            = _mm_mul_ps(qq21,rinv21);
1258             felec            = _mm_mul_ps(velec,rinvsq21);
1259
1260             fscal            = felec;
1261
1262              /* Update vectorial force */
1263             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1264             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1265             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1266
1267             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1268             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1269             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1270
1271             /**************************
1272              * CALCULATE INTERACTIONS *
1273              **************************/
1274
1275             /* COULOMB ELECTROSTATICS */
1276             velec            = _mm_mul_ps(qq22,rinv22);
1277             felec            = _mm_mul_ps(velec,rinvsq22);
1278
1279             fscal            = felec;
1280
1281              /* Update vectorial force */
1282             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1283             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1284             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1285
1286             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1287             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1288             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1289
1290             fjptrA             = f+j_coord_offsetA;
1291             fjptrB             = f+j_coord_offsetB;
1292             fjptrC             = f+j_coord_offsetC;
1293             fjptrD             = f+j_coord_offsetD;
1294
1295             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1296                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1297
1298             /* Inner loop uses 277 flops */
1299         }
1300
1301         if(jidx<j_index_end)
1302         {
1303
1304             /* Get j neighbor index, and coordinate index */
1305             jnrlistA         = jjnr[jidx];
1306             jnrlistB         = jjnr[jidx+1];
1307             jnrlistC         = jjnr[jidx+2];
1308             jnrlistD         = jjnr[jidx+3];
1309             /* Sign of each element will be negative for non-real atoms.
1310              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1311              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1312              */
1313             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1314             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1315             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1316             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1317             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1318             j_coord_offsetA  = DIM*jnrA;
1319             j_coord_offsetB  = DIM*jnrB;
1320             j_coord_offsetC  = DIM*jnrC;
1321             j_coord_offsetD  = DIM*jnrD;
1322
1323             /* load j atom coordinates */
1324             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1325                                               x+j_coord_offsetC,x+j_coord_offsetD,
1326                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1327
1328             /* Calculate displacement vector */
1329             dx00             = _mm_sub_ps(ix0,jx0);
1330             dy00             = _mm_sub_ps(iy0,jy0);
1331             dz00             = _mm_sub_ps(iz0,jz0);
1332             dx01             = _mm_sub_ps(ix0,jx1);
1333             dy01             = _mm_sub_ps(iy0,jy1);
1334             dz01             = _mm_sub_ps(iz0,jz1);
1335             dx02             = _mm_sub_ps(ix0,jx2);
1336             dy02             = _mm_sub_ps(iy0,jy2);
1337             dz02             = _mm_sub_ps(iz0,jz2);
1338             dx10             = _mm_sub_ps(ix1,jx0);
1339             dy10             = _mm_sub_ps(iy1,jy0);
1340             dz10             = _mm_sub_ps(iz1,jz0);
1341             dx11             = _mm_sub_ps(ix1,jx1);
1342             dy11             = _mm_sub_ps(iy1,jy1);
1343             dz11             = _mm_sub_ps(iz1,jz1);
1344             dx12             = _mm_sub_ps(ix1,jx2);
1345             dy12             = _mm_sub_ps(iy1,jy2);
1346             dz12             = _mm_sub_ps(iz1,jz2);
1347             dx20             = _mm_sub_ps(ix2,jx0);
1348             dy20             = _mm_sub_ps(iy2,jy0);
1349             dz20             = _mm_sub_ps(iz2,jz0);
1350             dx21             = _mm_sub_ps(ix2,jx1);
1351             dy21             = _mm_sub_ps(iy2,jy1);
1352             dz21             = _mm_sub_ps(iz2,jz1);
1353             dx22             = _mm_sub_ps(ix2,jx2);
1354             dy22             = _mm_sub_ps(iy2,jy2);
1355             dz22             = _mm_sub_ps(iz2,jz2);
1356
1357             /* Calculate squared distance and things based on it */
1358             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1359             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1360             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1361             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1362             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1363             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1364             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1365             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1366             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1367
1368             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1369             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1370             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1371             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1372             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1373             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1374             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1375             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1376             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1377
1378             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1379             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1380             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1381             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1382             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1383             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1384             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1385             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1386             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1387
1388             fjx0             = _mm_setzero_ps();
1389             fjy0             = _mm_setzero_ps();
1390             fjz0             = _mm_setzero_ps();
1391             fjx1             = _mm_setzero_ps();
1392             fjy1             = _mm_setzero_ps();
1393             fjz1             = _mm_setzero_ps();
1394             fjx2             = _mm_setzero_ps();
1395             fjy2             = _mm_setzero_ps();
1396             fjz2             = _mm_setzero_ps();
1397
1398             /**************************
1399              * CALCULATE INTERACTIONS *
1400              **************************/
1401
1402             /* COULOMB ELECTROSTATICS */
1403             velec            = _mm_mul_ps(qq00,rinv00);
1404             felec            = _mm_mul_ps(velec,rinvsq00);
1405
1406             /* LENNARD-JONES DISPERSION/REPULSION */
1407
1408             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1409             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1410
1411             fscal            = _mm_add_ps(felec,fvdw);
1412
1413             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1414
1415              /* Update vectorial force */
1416             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1417             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1418             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1419
1420             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1421             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1422             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1423
1424             /**************************
1425              * CALCULATE INTERACTIONS *
1426              **************************/
1427
1428             /* COULOMB ELECTROSTATICS */
1429             velec            = _mm_mul_ps(qq01,rinv01);
1430             felec            = _mm_mul_ps(velec,rinvsq01);
1431
1432             fscal            = felec;
1433
1434             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1435
1436              /* Update vectorial force */
1437             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1438             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1439             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1440
1441             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1442             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1443             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1444
1445             /**************************
1446              * CALCULATE INTERACTIONS *
1447              **************************/
1448
1449             /* COULOMB ELECTROSTATICS */
1450             velec            = _mm_mul_ps(qq02,rinv02);
1451             felec            = _mm_mul_ps(velec,rinvsq02);
1452
1453             fscal            = felec;
1454
1455             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1456
1457              /* Update vectorial force */
1458             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1459             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1460             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1461
1462             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1463             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1464             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1465
1466             /**************************
1467              * CALCULATE INTERACTIONS *
1468              **************************/
1469
1470             /* COULOMB ELECTROSTATICS */
1471             velec            = _mm_mul_ps(qq10,rinv10);
1472             felec            = _mm_mul_ps(velec,rinvsq10);
1473
1474             fscal            = felec;
1475
1476             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1477
1478              /* Update vectorial force */
1479             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1480             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1481             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1482
1483             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1484             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1485             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1486
1487             /**************************
1488              * CALCULATE INTERACTIONS *
1489              **************************/
1490
1491             /* COULOMB ELECTROSTATICS */
1492             velec            = _mm_mul_ps(qq11,rinv11);
1493             felec            = _mm_mul_ps(velec,rinvsq11);
1494
1495             fscal            = felec;
1496
1497             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1498
1499              /* Update vectorial force */
1500             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1501             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1502             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1503
1504             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1505             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1506             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1507
1508             /**************************
1509              * CALCULATE INTERACTIONS *
1510              **************************/
1511
1512             /* COULOMB ELECTROSTATICS */
1513             velec            = _mm_mul_ps(qq12,rinv12);
1514             felec            = _mm_mul_ps(velec,rinvsq12);
1515
1516             fscal            = felec;
1517
1518             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1519
1520              /* Update vectorial force */
1521             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1522             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1523             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1524
1525             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1526             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1527             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1528
1529             /**************************
1530              * CALCULATE INTERACTIONS *
1531              **************************/
1532
1533             /* COULOMB ELECTROSTATICS */
1534             velec            = _mm_mul_ps(qq20,rinv20);
1535             felec            = _mm_mul_ps(velec,rinvsq20);
1536
1537             fscal            = felec;
1538
1539             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1540
1541              /* Update vectorial force */
1542             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1543             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1544             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1545
1546             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1547             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1548             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1549
1550             /**************************
1551              * CALCULATE INTERACTIONS *
1552              **************************/
1553
1554             /* COULOMB ELECTROSTATICS */
1555             velec            = _mm_mul_ps(qq21,rinv21);
1556             felec            = _mm_mul_ps(velec,rinvsq21);
1557
1558             fscal            = felec;
1559
1560             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1561
1562              /* Update vectorial force */
1563             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1564             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1565             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1566
1567             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1568             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1569             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1570
1571             /**************************
1572              * CALCULATE INTERACTIONS *
1573              **************************/
1574
1575             /* COULOMB ELECTROSTATICS */
1576             velec            = _mm_mul_ps(qq22,rinv22);
1577             felec            = _mm_mul_ps(velec,rinvsq22);
1578
1579             fscal            = felec;
1580
1581             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1582
1583              /* Update vectorial force */
1584             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1585             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1586             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1587
1588             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1589             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1590             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1591
1592             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1593             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1594             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1595             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1596
1597             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1598                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1599
1600             /* Inner loop uses 277 flops */
1601         }
1602
1603         /* End of innermost loop */
1604
1605         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1606                                               f+i_coord_offset,fshift+i_shift_offset);
1607
1608         /* Increment number of inner iterations */
1609         inneriter                  += j_index_end - j_index_start;
1610
1611         /* Outer loop uses 18 flops */
1612     }
1613
1614     /* Increment number of outer iterations */
1615     outeriter        += nri;
1616
1617     /* Update outer/inner flops */
1618
1619     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*277);
1620 }