src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_avx_128_fma_single.c

   1 /*
   2  * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
   3  *
   4  *                This source code is part of
   5  *
   6  *                 G   R   O   M   A   C   S
   7  *
   8  * Copyright (c) 2001-2012, The GROMACS Development Team
   9  *
  10  * Gromacs is a library for molecular simulation and trajectory analysis,
  11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  12  * a full list of developers and information, check out http://www.gromacs.org
  13  *
  14  * This program is free software; you can redistribute it and/or modify it under
  15  * the terms of the GNU Lesser General Public License as published by the Free
  16  * Software Foundation; either version 2 of the License, or (at your option) any
  17  * later version.
  18  *
  19  * To help fund GROMACS development, we humbly ask that you cite
  20  * the papers people have written on it - you can find them on the website.
  21  */
  22 #ifdef HAVE_CONFIG_H
  23 #include <config.h>
  24 #endif
  25
  26 #include <math.h>
  27
  28 #include "../nb_kernel.h"
  29 #include "types/simple.h"
  30 #include "vec.h"
  31 #include "nrnb.h"
  32
  33 #include "gmx_math_x86_avx_128_fma_single.h"
  34 #include "kernelutil_x86_avx_128_fma_single.h"
  35
  36 /*
  37  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
  38  * Electrostatics interaction: Coulomb
  39  * VdW interaction:            CubicSplineTable
  40  * Geometry:                   Water3-Water3
  41  * Calculate force/pot:        PotentialAndForce
  42  */
  43 void
  44 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
  45                     (t_nblist * gmx_restrict                nlist,
  46                      rvec * gmx_restrict                    xx,
  47                      rvec * gmx_restrict                    ff,
  48                      t_forcerec * gmx_restrict              fr,
  49                      t_mdatoms * gmx_restrict               mdatoms,
  50                      nb_kernel_data_t * gmx_restrict        kernel_data,
  51                      t_nrnb * gmx_restrict                  nrnb)
  52 {
  53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
  54      * just 0 for non-waters.
  55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, i.e. to the four different
  56      * jnr indices corresponding to data put in the four positions in the SIMD register.
  57      */
  58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
  59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
  60     int              jnrA,jnrB,jnrC,jnrD;
  61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
  62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
  63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
  64     real             rcutoff_scalar;
  65     real             *shiftvec,*fshift,*x,*f;
  66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
  67     real             scratch[4*DIM];
  68     __m128           fscal,rcutoff,rcutoff2,jidxall;
  69     int              vdwioffset0;
  70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
  71     int              vdwioffset1;
  72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
  73     int              vdwioffset2;
  74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
  75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
  76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
  77     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
  78     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
  79     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
  80     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
  81     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
  82     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
  83     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
  84     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
  85     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
  86     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
  87     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
  88     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
  89     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
  90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
  91     real             *charge;
  92     int              nvdwtype;
  93     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
  94     int              *vdwtype;
  95     real             *vdwparam;
  96     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
  97     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
  98     __m128i          vfitab;
  99     __m128i          ifour       = _mm_set1_epi32(4);
 100     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
 101     real             *vftab;
 102     __m128           dummy_mask,cutoff_mask;
 103     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 104     __m128           one     = _mm_set1_ps(1.0);
 105     __m128           two     = _mm_set1_ps(2.0);
 106     x                = xx[0];
 107     f                = ff[0];
 108
 109     nri              = nlist->nri;
 110     iinr             = nlist->iinr;
 111     jindex           = nlist->jindex;
 112     jjnr             = nlist->jjnr;
 113     shiftidx         = nlist->shift;
 114     gid              = nlist->gid;
 115     shiftvec         = fr->shift_vec[0];
 116     fshift           = fr->fshift[0];
 117     facel            = _mm_set1_ps(fr->epsfac);
 118     charge           = mdatoms->chargeA;
 119     nvdwtype         = fr->ntype;
 120     vdwparam         = fr->nbfp;
 121     vdwtype          = mdatoms->typeA;
 122
 123     vftab            = kernel_data->table_vdw->data;
 124     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
 125
 126     /* Setup water-specific parameters */
 127     inr              = nlist->iinr[0];
 128     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
 129     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
 130     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
 131     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
 132
 133     jq0              = _mm_set1_ps(charge[inr+0]);
 134     jq1              = _mm_set1_ps(charge[inr+1]);
 135     jq2              = _mm_set1_ps(charge[inr+2]);
 136     vdwjidx0A        = 2*vdwtype[inr+0];
 137     qq00             = _mm_mul_ps(iq0,jq0);
 138     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
 139     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
 140     qq01             = _mm_mul_ps(iq0,jq1);
 141     qq02             = _mm_mul_ps(iq0,jq2);
 142     qq10             = _mm_mul_ps(iq1,jq0);
 143     qq11             = _mm_mul_ps(iq1,jq1);
 144     qq12             = _mm_mul_ps(iq1,jq2);
 145     qq20             = _mm_mul_ps(iq2,jq0);
 146     qq21             = _mm_mul_ps(iq2,jq1);
 147     qq22             = _mm_mul_ps(iq2,jq2);
 148
 149     /* Avoid stupid compiler warnings */
 150     jnrA = jnrB = jnrC = jnrD = 0;
 151     j_coord_offsetA = 0;
 152     j_coord_offsetB = 0;
 153     j_coord_offsetC = 0;
 154     j_coord_offsetD = 0;
 155
 156     outeriter        = 0;
 157     inneriter        = 0;
 158
 159     for(iidx=0;iidx<4*DIM;iidx++)
 160     {
 161         scratch[iidx] = 0.0;
 162     }
 163
 164     /* Start outer loop over neighborlists */
 165     for(iidx=0; iidx<nri; iidx++)
 166     {
 167         /* Load shift vector for this list */
 168         i_shift_offset   = DIM*shiftidx[iidx];
 169
 170         /* Load limits for loop over neighbors */
 171         j_index_start    = jindex[iidx];
 172         j_index_end      = jindex[iidx+1];
 173
 174         /* Get outer coordinate index */
 175         inr              = iinr[iidx];
 176         i_coord_offset   = DIM*inr;
 177
 178         /* Load i particle coords and add shift vector */
 179         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 180                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 181
 182         fix0             = _mm_setzero_ps();
 183         fiy0             = _mm_setzero_ps();
 184         fiz0             = _mm_setzero_ps();
 185         fix1             = _mm_setzero_ps();
 186         fiy1             = _mm_setzero_ps();
 187         fiz1             = _mm_setzero_ps();
 188         fix2             = _mm_setzero_ps();
 189         fiy2             = _mm_setzero_ps();
 190         fiz2             = _mm_setzero_ps();
 191
 192         /* Reset potential sums */
 193         velecsum         = _mm_setzero_ps();
 194         vvdwsum          = _mm_setzero_ps();
 195
 196         /* Start inner kernel loop */
 197         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 198         {
 199
 200             /* Get j neighbor index, and coordinate index */
 201             jnrA             = jjnr[jidx];
 202             jnrB             = jjnr[jidx+1];
 203             jnrC             = jjnr[jidx+2];
 204             jnrD             = jjnr[jidx+3];
 205             j_coord_offsetA  = DIM*jnrA;
 206             j_coord_offsetB  = DIM*jnrB;
 207             j_coord_offsetC  = DIM*jnrC;
 208             j_coord_offsetD  = DIM*jnrD;
 209
 210             /* load j atom coordinates */
 211             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 212                                               x+j_coord_offsetC,x+j_coord_offsetD,
 213                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 214
 215             /* Calculate displacement vector */
 216             dx00             = _mm_sub_ps(ix0,jx0);
 217             dy00             = _mm_sub_ps(iy0,jy0);
 218             dz00             = _mm_sub_ps(iz0,jz0);
 219             dx01             = _mm_sub_ps(ix0,jx1);
 220             dy01             = _mm_sub_ps(iy0,jy1);
 221             dz01             = _mm_sub_ps(iz0,jz1);
 222             dx02             = _mm_sub_ps(ix0,jx2);
 223             dy02             = _mm_sub_ps(iy0,jy2);
 224             dz02             = _mm_sub_ps(iz0,jz2);
 225             dx10             = _mm_sub_ps(ix1,jx0);
 226             dy10             = _mm_sub_ps(iy1,jy0);
 227             dz10             = _mm_sub_ps(iz1,jz0);
 228             dx11             = _mm_sub_ps(ix1,jx1);
 229             dy11             = _mm_sub_ps(iy1,jy1);
 230             dz11             = _mm_sub_ps(iz1,jz1);
 231             dx12             = _mm_sub_ps(ix1,jx2);
 232             dy12             = _mm_sub_ps(iy1,jy2);
 233             dz12             = _mm_sub_ps(iz1,jz2);
 234             dx20             = _mm_sub_ps(ix2,jx0);
 235             dy20             = _mm_sub_ps(iy2,jy0);
 236             dz20             = _mm_sub_ps(iz2,jz0);
 237             dx21             = _mm_sub_ps(ix2,jx1);
 238             dy21             = _mm_sub_ps(iy2,jy1);
 239             dz21             = _mm_sub_ps(iz2,jz1);
 240             dx22             = _mm_sub_ps(ix2,jx2);
 241             dy22             = _mm_sub_ps(iy2,jy2);
 242             dz22             = _mm_sub_ps(iz2,jz2);
 243
 244             /* Calculate squared distance and things based on it */
 245             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 246             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
 247             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
 248             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
 249             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
 250             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
 251             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
 252             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
 253             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
 254
 255             rinv00           = gmx_mm_invsqrt_ps(rsq00);
 256             rinv01           = gmx_mm_invsqrt_ps(rsq01);
 257             rinv02           = gmx_mm_invsqrt_ps(rsq02);
 258             rinv10           = gmx_mm_invsqrt_ps(rsq10);
 259             rinv11           = gmx_mm_invsqrt_ps(rsq11);
 260             rinv12           = gmx_mm_invsqrt_ps(rsq12);
 261             rinv20           = gmx_mm_invsqrt_ps(rsq20);
 262             rinv21           = gmx_mm_invsqrt_ps(rsq21);
 263             rinv22           = gmx_mm_invsqrt_ps(rsq22);
 264
 265             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 266             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
 267             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
 268             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
 269             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
 270             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
 271             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
 272             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
 273             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
 274
 275             fjx0             = _mm_setzero_ps();
 276             fjy0             = _mm_setzero_ps();
 277             fjz0             = _mm_setzero_ps();
 278             fjx1             = _mm_setzero_ps();
 279             fjy1             = _mm_setzero_ps();
 280             fjz1             = _mm_setzero_ps();
 281             fjx2             = _mm_setzero_ps();
 282             fjy2             = _mm_setzero_ps();
 283             fjz2             = _mm_setzero_ps();
 284
 285             /**************************
 286              * CALCULATE INTERACTIONS *
 287              **************************/
 288
 289             r00              = _mm_mul_ps(rsq00,rinv00);
 290
 291             /* Calculate table index by multiplying r with table scale and truncate to integer */
 292             rt               = _mm_mul_ps(r00,vftabscale);
 293             vfitab           = _mm_cvttps_epi32(rt);
 294 #ifdef __XOP__
 295             vfeps            = _mm_frcz_ps(rt);
 296 #else
 297             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 298 #endif
 299             twovfeps         = _mm_add_ps(vfeps,vfeps);
 300             vfitab           = _mm_slli_epi32(vfitab,3);
 301
 302             /* COULOMB ELECTROSTATICS */
 303             velec            = _mm_mul_ps(qq00,rinv00);
 304             felec            = _mm_mul_ps(velec,rinvsq00);
 305
 306             /* CUBIC SPLINE TABLE DISPERSION */
 307             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 308             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 309             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 310             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 311             _MM_TRANSPOSE4_PS(Y,F,G,H);
 312             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 313             VV               = _mm_macc_ps(vfeps,Fp,Y);
 314             vvdw6            = _mm_mul_ps(c6_00,VV);
 315             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 316             fvdw6            = _mm_mul_ps(c6_00,FF);
 317
 318             /* CUBIC SPLINE TABLE REPULSION */
 319             vfitab           = _mm_add_epi32(vfitab,ifour);
 320             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 321             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 322             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 323             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 324             _MM_TRANSPOSE4_PS(Y,F,G,H);
 325             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 326             VV               = _mm_macc_ps(vfeps,Fp,Y);
 327             vvdw12           = _mm_mul_ps(c12_00,VV);
 328             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 329             fvdw12           = _mm_mul_ps(c12_00,FF);
 330             vvdw             = _mm_add_ps(vvdw12,vvdw6);
 331             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 332
 333             /* Update potential sum for this i atom from the interaction with this j atom. */
 334             velecsum         = _mm_add_ps(velecsum,velec);
 335             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 336
 337             fscal            = _mm_add_ps(felec,fvdw);
 338
 339              /* Update vectorial force */
 340             fix0             = _mm_macc_ps(dx00,fscal,fix0);
 341             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 342             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 343
 344             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
 345             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
 346             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 347
 348             /**************************
 349              * CALCULATE INTERACTIONS *
 350              **************************/
 351
 352             /* COULOMB ELECTROSTATICS */
 353             velec            = _mm_mul_ps(qq01,rinv01);
 354             felec            = _mm_mul_ps(velec,rinvsq01);
 355
 356             /* Update potential sum for this i atom from the interaction with this j atom. */
 357             velecsum         = _mm_add_ps(velecsum,velec);
 358
 359             fscal            = felec;
 360
 361              /* Update vectorial force */
 362             fix0             = _mm_macc_ps(dx01,fscal,fix0);
 363             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
 364             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
 365
 366             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
 367             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
 368             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
 369
 370             /**************************
 371              * CALCULATE INTERACTIONS *
 372              **************************/
 373
 374             /* COULOMB ELECTROSTATICS */
 375             velec            = _mm_mul_ps(qq02,rinv02);
 376             felec            = _mm_mul_ps(velec,rinvsq02);
 377
 378             /* Update potential sum for this i atom from the interaction with this j atom. */
 379             velecsum         = _mm_add_ps(velecsum,velec);
 380
 381             fscal            = felec;
 382
 383              /* Update vectorial force */
 384             fix0             = _mm_macc_ps(dx02,fscal,fix0);
 385             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
 386             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
 387
 388             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
 389             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
 390             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
 391
 392             /**************************
 393              * CALCULATE INTERACTIONS *
 394              **************************/
 395
 396             /* COULOMB ELECTROSTATICS */
 397             velec            = _mm_mul_ps(qq10,rinv10);
 398             felec            = _mm_mul_ps(velec,rinvsq10);
 399
 400             /* Update potential sum for this i atom from the interaction with this j atom. */
 401             velecsum         = _mm_add_ps(velecsum,velec);
 402
 403             fscal            = felec;
 404
 405              /* Update vectorial force */
 406             fix1             = _mm_macc_ps(dx10,fscal,fix1);
 407             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
 408             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 409
 410             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
 411             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
 412             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 413
 414             /**************************
 415              * CALCULATE INTERACTIONS *
 416              **************************/
 417
 418             /* COULOMB ELECTROSTATICS */
 419             velec            = _mm_mul_ps(qq11,rinv11);
 420             felec            = _mm_mul_ps(velec,rinvsq11);
 421
 422             /* Update potential sum for this i atom from the interaction with this j atom. */
 423             velecsum         = _mm_add_ps(velecsum,velec);
 424
 425             fscal            = felec;
 426
 427              /* Update vectorial force */
 428             fix1             = _mm_macc_ps(dx11,fscal,fix1);
 429             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
 430             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
 431
 432             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
 433             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
 434             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
 435
 436             /**************************
 437              * CALCULATE INTERACTIONS *
 438              **************************/
 439
 440             /* COULOMB ELECTROSTATICS */
 441             velec            = _mm_mul_ps(qq12,rinv12);
 442             felec            = _mm_mul_ps(velec,rinvsq12);
 443
 444             /* Update potential sum for this i atom from the interaction with this j atom. */
 445             velecsum         = _mm_add_ps(velecsum,velec);
 446
 447             fscal            = felec;
 448
 449              /* Update vectorial force */
 450             fix1             = _mm_macc_ps(dx12,fscal,fix1);
 451             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
 452             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
 453
 454             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
 455             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
 456             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
 457
 458             /**************************
 459              * CALCULATE INTERACTIONS *
 460              **************************/
 461
 462             /* COULOMB ELECTROSTATICS */
 463             velec            = _mm_mul_ps(qq20,rinv20);
 464             felec            = _mm_mul_ps(velec,rinvsq20);
 465
 466             /* Update potential sum for this i atom from the interaction with this j atom. */
 467             velecsum         = _mm_add_ps(velecsum,velec);
 468
 469             fscal            = felec;
 470
 471              /* Update vectorial force */
 472             fix2             = _mm_macc_ps(dx20,fscal,fix2);
 473             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
 474             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 475
 476             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
 477             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
 478             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 479
 480             /**************************
 481              * CALCULATE INTERACTIONS *
 482              **************************/
 483
 484             /* COULOMB ELECTROSTATICS */
 485             velec            = _mm_mul_ps(qq21,rinv21);
 486             felec            = _mm_mul_ps(velec,rinvsq21);
 487
 488             /* Update potential sum for this i atom from the interaction with this j atom. */
 489             velecsum         = _mm_add_ps(velecsum,velec);
 490
 491             fscal            = felec;
 492
 493              /* Update vectorial force */
 494             fix2             = _mm_macc_ps(dx21,fscal,fix2);
 495             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
 496             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
 497
 498             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
 499             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
 500             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
 501
 502             /**************************
 503              * CALCULATE INTERACTIONS *
 504              **************************/
 505
 506             /* COULOMB ELECTROSTATICS */
 507             velec            = _mm_mul_ps(qq22,rinv22);
 508             felec            = _mm_mul_ps(velec,rinvsq22);
 509
 510             /* Update potential sum for this i atom from the interaction with this j atom. */
 511             velecsum         = _mm_add_ps(velecsum,velec);
 512
 513             fscal            = felec;
 514
 515              /* Update vectorial force */
 516             fix2             = _mm_macc_ps(dx22,fscal,fix2);
 517             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
 518             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
 519
 520             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
 521             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
 522             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
 523
 524             fjptrA             = f+j_coord_offsetA;
 525             fjptrB             = f+j_coord_offsetB;
 526             fjptrC             = f+j_coord_offsetC;
 527             fjptrD             = f+j_coord_offsetD;
 528
 529             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 530                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 531
 532             /* Inner loop uses 314 flops */
 533         }
 534
 535         if(jidx<j_index_end)
 536         {
 537
 538             /* Get j neighbor index, and coordinate index */
 539             jnrlistA         = jjnr[jidx];
 540             jnrlistB         = jjnr[jidx+1];
 541             jnrlistC         = jjnr[jidx+2];
 542             jnrlistD         = jjnr[jidx+3];
 543             /* Sign of each element will be negative for non-real atoms.
 544              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 545              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 546              */
 547             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 548             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 549             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 550             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 551             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 552             j_coord_offsetA  = DIM*jnrA;
 553             j_coord_offsetB  = DIM*jnrB;
 554             j_coord_offsetC  = DIM*jnrC;
 555             j_coord_offsetD  = DIM*jnrD;
 556
 557             /* load j atom coordinates */
 558             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 559                                               x+j_coord_offsetC,x+j_coord_offsetD,
 560                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 561
 562             /* Calculate displacement vector */
 563             dx00             = _mm_sub_ps(ix0,jx0);
 564             dy00             = _mm_sub_ps(iy0,jy0);
 565             dz00             = _mm_sub_ps(iz0,jz0);
 566             dx01             = _mm_sub_ps(ix0,jx1);
 567             dy01             = _mm_sub_ps(iy0,jy1);
 568             dz01             = _mm_sub_ps(iz0,jz1);
 569             dx02             = _mm_sub_ps(ix0,jx2);
 570             dy02             = _mm_sub_ps(iy0,jy2);
 571             dz02             = _mm_sub_ps(iz0,jz2);
 572             dx10             = _mm_sub_ps(ix1,jx0);
 573             dy10             = _mm_sub_ps(iy1,jy0);
 574             dz10             = _mm_sub_ps(iz1,jz0);
 575             dx11             = _mm_sub_ps(ix1,jx1);
 576             dy11             = _mm_sub_ps(iy1,jy1);
 577             dz11             = _mm_sub_ps(iz1,jz1);
 578             dx12             = _mm_sub_ps(ix1,jx2);
 579             dy12             = _mm_sub_ps(iy1,jy2);
 580             dz12             = _mm_sub_ps(iz1,jz2);
 581             dx20             = _mm_sub_ps(ix2,jx0);
 582             dy20             = _mm_sub_ps(iy2,jy0);
 583             dz20             = _mm_sub_ps(iz2,jz0);
 584             dx21             = _mm_sub_ps(ix2,jx1);
 585             dy21             = _mm_sub_ps(iy2,jy1);
 586             dz21             = _mm_sub_ps(iz2,jz1);
 587             dx22             = _mm_sub_ps(ix2,jx2);
 588             dy22             = _mm_sub_ps(iy2,jy2);
 589             dz22             = _mm_sub_ps(iz2,jz2);
 590
 591             /* Calculate squared distance and things based on it */
 592             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 593             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
 594             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
 595             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
 596             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
 597             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
 598             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
 599             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
 600             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
 601
 602             rinv00           = gmx_mm_invsqrt_ps(rsq00);
 603             rinv01           = gmx_mm_invsqrt_ps(rsq01);
 604             rinv02           = gmx_mm_invsqrt_ps(rsq02);
 605             rinv10           = gmx_mm_invsqrt_ps(rsq10);
 606             rinv11           = gmx_mm_invsqrt_ps(rsq11);
 607             rinv12           = gmx_mm_invsqrt_ps(rsq12);
 608             rinv20           = gmx_mm_invsqrt_ps(rsq20);
 609             rinv21           = gmx_mm_invsqrt_ps(rsq21);
 610             rinv22           = gmx_mm_invsqrt_ps(rsq22);
 611
 612             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 613             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
 614             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
 615             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
 616             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
 617             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
 618             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
 619             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
 620             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
 621
 622             fjx0             = _mm_setzero_ps();
 623             fjy0             = _mm_setzero_ps();
 624             fjz0             = _mm_setzero_ps();
 625             fjx1             = _mm_setzero_ps();
 626             fjy1             = _mm_setzero_ps();
 627             fjz1             = _mm_setzero_ps();
 628             fjx2             = _mm_setzero_ps();
 629             fjy2             = _mm_setzero_ps();
 630             fjz2             = _mm_setzero_ps();
 631
 632             /**************************
 633              * CALCULATE INTERACTIONS *
 634              **************************/
 635
 636             r00              = _mm_mul_ps(rsq00,rinv00);
 637             r00              = _mm_andnot_ps(dummy_mask,r00);
 638
 639             /* Calculate table index by multiplying r with table scale and truncate to integer */
 640             rt               = _mm_mul_ps(r00,vftabscale);
 641             vfitab           = _mm_cvttps_epi32(rt);
 642 #ifdef __XOP__
 643             vfeps            = _mm_frcz_ps(rt);
 644 #else
 645             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
 646 #endif
 647             twovfeps         = _mm_add_ps(vfeps,vfeps);
 648             vfitab           = _mm_slli_epi32(vfitab,3);
 649
 650             /* COULOMB ELECTROSTATICS */
 651             velec            = _mm_mul_ps(qq00,rinv00);
 652             felec            = _mm_mul_ps(velec,rinvsq00);
 653
 654             /* CUBIC SPLINE TABLE DISPERSION */
 655             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 656             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 657             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 658             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 659             _MM_TRANSPOSE4_PS(Y,F,G,H);
 660             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 661             VV               = _mm_macc_ps(vfeps,Fp,Y);
 662             vvdw6            = _mm_mul_ps(c6_00,VV);
 663             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 664             fvdw6            = _mm_mul_ps(c6_00,FF);
 665
 666             /* CUBIC SPLINE TABLE REPULSION */
 667             vfitab           = _mm_add_epi32(vfitab,ifour);
 668             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
 669             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
 670             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
 671             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
 672             _MM_TRANSPOSE4_PS(Y,F,G,H);
 673             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
 674             VV               = _mm_macc_ps(vfeps,Fp,Y);
 675             vvdw12           = _mm_mul_ps(c12_00,VV);
 676             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
 677             fvdw12           = _mm_mul_ps(c12_00,FF);
 678             vvdw             = _mm_add_ps(vvdw12,vvdw6);
 679             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
 680
 681             /* Update potential sum for this i atom from the interaction with this j atom. */
 682             velec            = _mm_andnot_ps(dummy_mask,velec);
 683             velecsum         = _mm_add_ps(velecsum,velec);
 684             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
 685             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
 686
 687             fscal            = _mm_add_ps(felec,fvdw);
 688
 689             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 690
 691              /* Update vectorial force */
 692             fix0             = _mm_macc_ps(dx00,fscal,fix0);
 693             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 694             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 695
 696             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
 697             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
 698             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 699
 700             /**************************
 701              * CALCULATE INTERACTIONS *
 702              **************************/
 703
 704             /* COULOMB ELECTROSTATICS */
 705             velec            = _mm_mul_ps(qq01,rinv01);
 706             felec            = _mm_mul_ps(velec,rinvsq01);
 707
 708             /* Update potential sum for this i atom from the interaction with this j atom. */
 709             velec            = _mm_andnot_ps(dummy_mask,velec);
 710             velecsum         = _mm_add_ps(velecsum,velec);
 711
 712             fscal            = felec;
 713
 714             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 715
 716              /* Update vectorial force */
 717             fix0             = _mm_macc_ps(dx01,fscal,fix0);
 718             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
 719             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
 720
 721             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
 722             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
 723             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
 724
 725             /**************************
 726              * CALCULATE INTERACTIONS *
 727              **************************/
 728
 729             /* COULOMB ELECTROSTATICS */
 730             velec            = _mm_mul_ps(qq02,rinv02);
 731             felec            = _mm_mul_ps(velec,rinvsq02);
 732
 733             /* Update potential sum for this i atom from the interaction with this j atom. */
 734             velec            = _mm_andnot_ps(dummy_mask,velec);
 735             velecsum         = _mm_add_ps(velecsum,velec);
 736
 737             fscal            = felec;
 738
 739             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 740
 741              /* Update vectorial force */
 742             fix0             = _mm_macc_ps(dx02,fscal,fix0);
 743             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
 744             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
 745
 746             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
 747             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
 748             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
 749
 750             /**************************
 751              * CALCULATE INTERACTIONS *
 752              **************************/
 753
 754             /* COULOMB ELECTROSTATICS */
 755             velec            = _mm_mul_ps(qq10,rinv10);
 756             felec            = _mm_mul_ps(velec,rinvsq10);
 757
 758             /* Update potential sum for this i atom from the interaction with this j atom. */
 759             velec            = _mm_andnot_ps(dummy_mask,velec);
 760             velecsum         = _mm_add_ps(velecsum,velec);
 761
 762             fscal            = felec;
 763
 764             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 765
 766              /* Update vectorial force */
 767             fix1             = _mm_macc_ps(dx10,fscal,fix1);
 768             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
 769             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 770
 771             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
 772             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
 773             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 774
 775             /**************************
 776              * CALCULATE INTERACTIONS *
 777              **************************/
 778
 779             /* COULOMB ELECTROSTATICS */
 780             velec            = _mm_mul_ps(qq11,rinv11);
 781             felec            = _mm_mul_ps(velec,rinvsq11);
 782
 783             /* Update potential sum for this i atom from the interaction with this j atom. */
 784             velec            = _mm_andnot_ps(dummy_mask,velec);
 785             velecsum         = _mm_add_ps(velecsum,velec);
 786
 787             fscal            = felec;
 788
 789             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 790
 791              /* Update vectorial force */
 792             fix1             = _mm_macc_ps(dx11,fscal,fix1);
 793             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
 794             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
 795
 796             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
 797             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
 798             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
 799
 800             /**************************
 801              * CALCULATE INTERACTIONS *
 802              **************************/
 803
 804             /* COULOMB ELECTROSTATICS */
 805             velec            = _mm_mul_ps(qq12,rinv12);
 806             felec            = _mm_mul_ps(velec,rinvsq12);
 807
 808             /* Update potential sum for this i atom from the interaction with this j atom. */
 809             velec            = _mm_andnot_ps(dummy_mask,velec);
 810             velecsum         = _mm_add_ps(velecsum,velec);
 811
 812             fscal            = felec;
 813
 814             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 815
 816              /* Update vectorial force */
 817             fix1             = _mm_macc_ps(dx12,fscal,fix1);
 818             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
 819             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
 820
 821             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
 822             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
 823             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
 824
 825             /**************************
 826              * CALCULATE INTERACTIONS *
 827              **************************/
 828
 829             /* COULOMB ELECTROSTATICS */
 830             velec            = _mm_mul_ps(qq20,rinv20);
 831             felec            = _mm_mul_ps(velec,rinvsq20);
 832
 833             /* Update potential sum for this i atom from the interaction with this j atom. */
 834             velec            = _mm_andnot_ps(dummy_mask,velec);
 835             velecsum         = _mm_add_ps(velecsum,velec);
 836
 837             fscal            = felec;
 838
 839             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 840
 841              /* Update vectorial force */
 842             fix2             = _mm_macc_ps(dx20,fscal,fix2);
 843             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
 844             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 845
 846             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
 847             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
 848             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 849
 850             /**************************
 851              * CALCULATE INTERACTIONS *
 852              **************************/
 853
 854             /* COULOMB ELECTROSTATICS */
 855             velec            = _mm_mul_ps(qq21,rinv21);
 856             felec            = _mm_mul_ps(velec,rinvsq21);
 857
 858             /* Update potential sum for this i atom from the interaction with this j atom. */
 859             velec            = _mm_andnot_ps(dummy_mask,velec);
 860             velecsum         = _mm_add_ps(velecsum,velec);
 861
 862             fscal            = felec;
 863
 864             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 865
 866              /* Update vectorial force */
 867             fix2             = _mm_macc_ps(dx21,fscal,fix2);
 868             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
 869             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
 870
 871             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
 872             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
 873             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
 874
 875             /**************************
 876              * CALCULATE INTERACTIONS *
 877              **************************/
 878
 879             /* COULOMB ELECTROSTATICS */
 880             velec            = _mm_mul_ps(qq22,rinv22);
 881             felec            = _mm_mul_ps(velec,rinvsq22);
 882
 883             /* Update potential sum for this i atom from the interaction with this j atom. */
 884             velec            = _mm_andnot_ps(dummy_mask,velec);
 885             velecsum         = _mm_add_ps(velecsum,velec);
 886
 887             fscal            = felec;
 888
 889             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 890
 891              /* Update vectorial force */
 892             fix2             = _mm_macc_ps(dx22,fscal,fix2);
 893             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
 894             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
 895
 896             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
 897             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
 898             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
 899
 900             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 901             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 902             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 903             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 904
 905             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 906                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 907
 908             /* Inner loop uses 315 flops */
 909         }
 910
 911         /* End of innermost loop */
 912
 913         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 914                                               f+i_coord_offset,fshift+i_shift_offset);
 915
 916         ggid                        = gid[iidx];
 917         /* Update potential energies */
 918         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 919         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
 920
 921         /* Increment number of inner iterations */
 922         inneriter                  += j_index_end - j_index_start;
 923
 924         /* Outer loop uses 20 flops */
 925     }
 926
 927     /* Increment number of outer iterations */
 928     outeriter        += nri;
 929
 930     /* Update outer/inner flops */
 931
 932     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*315);
 933 }
 934 /*
 935  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_single
 936  * Electrostatics interaction: Coulomb
 937  * VdW interaction:            CubicSplineTable
 938  * Geometry:                   Water3-Water3
 939  * Calculate force/pot:        Force
 940  */
 941 void
 942 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_single
 943                     (t_nblist * gmx_restrict                nlist,
 944                      rvec * gmx_restrict                    xx,
 945                      rvec * gmx_restrict                    ff,
 946                      t_forcerec * gmx_restrict              fr,
 947                      t_mdatoms * gmx_restrict               mdatoms,
 948                      nb_kernel_data_t * gmx_restrict        kernel_data,
 949                      t_nrnb * gmx_restrict                  nrnb)
 950 {
 951     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 952      * just 0 for non-waters.
 953      * Suffixes A,B,C,D refer to the j loop unrolling done with AVX_128, i.e. to the four different
 954      * jnr indices whose data is placed in the four positions of the SIMD register.
 955      */
 956     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 957     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 958     int              jnrA,jnrB,jnrC,jnrD;
 959     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 960     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 961     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 962     real             rcutoff_scalar;
 963     real             *shiftvec,*fshift,*x,*f;
 964     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 965     real             scratch[4*DIM];
 966     __m128           fscal,rcutoff,rcutoff2,jidxall;
 967     int              vdwioffset0;
 968     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 969     int              vdwioffset1;
 970     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
 971     int              vdwioffset2;
 972     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
 973     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 974     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 975     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
 976     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
 977     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
 978     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
 979     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 980     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
 981     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
 982     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
 983     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
 984     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
 985     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
 986     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
 987     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
 988     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 989     real             *charge;
 990     int              nvdwtype;
 991     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
 992     int              *vdwtype;
 993     real             *vdwparam;
 994     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
 995     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
 996     __m128i          vfitab;
 997     __m128i          ifour       = _mm_set1_epi32(4);
 998     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
 999     real             *vftab;
1000     __m128           dummy_mask,cutoff_mask;
1001     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1002     __m128           one     = _mm_set1_ps(1.0);
1003     __m128           two     = _mm_set1_ps(2.0);
1004     x                = xx[0];
1005     f                = ff[0];
1006
1007     nri              = nlist->nri;
1008     iinr             = nlist->iinr;
1009     jindex           = nlist->jindex;
1010     jjnr             = nlist->jjnr;
1011     shiftidx         = nlist->shift;
1012     gid              = nlist->gid;
1013     shiftvec         = fr->shift_vec[0];
1014     fshift           = fr->fshift[0];
1015     facel            = _mm_set1_ps(fr->epsfac);
1016     charge           = mdatoms->chargeA;
1017     nvdwtype         = fr->ntype;
1018     vdwparam         = fr->nbfp;
1019     vdwtype          = mdatoms->typeA;
1020
1021     vftab            = kernel_data->table_vdw->data;
1022     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1023
1024     /* Setup water-specific parameters */
1025     inr              = nlist->iinr[0];
1026     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1027     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1028     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1029     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1030
1031     jq0              = _mm_set1_ps(charge[inr+0]);
1032     jq1              = _mm_set1_ps(charge[inr+1]);
1033     jq2              = _mm_set1_ps(charge[inr+2]);
1034     vdwjidx0A        = 2*vdwtype[inr+0];
1035     qq00             = _mm_mul_ps(iq0,jq0);
1036     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1037     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1038     qq01             = _mm_mul_ps(iq0,jq1);
1039     qq02             = _mm_mul_ps(iq0,jq2);
1040     qq10             = _mm_mul_ps(iq1,jq0);
1041     qq11             = _mm_mul_ps(iq1,jq1);
1042     qq12             = _mm_mul_ps(iq1,jq2);
1043     qq20             = _mm_mul_ps(iq2,jq0);
1044     qq21             = _mm_mul_ps(iq2,jq1);
1045     qq22             = _mm_mul_ps(iq2,jq2);
1046
1047     /* Zero the j indices and coordinate offsets up front to silence compiler warnings (presumably about possibly uninitialized use) */
1048     jnrA = jnrB = jnrC = jnrD = 0;
1049     j_coord_offsetA = 0;
1050     j_coord_offsetB = 0;
1051     j_coord_offsetC = 0;
1052     j_coord_offsetD = 0;
1053
1054     outeriter        = 0;
1055     inneriter        = 0;
1056
1057     for(iidx=0;iidx<4*DIM;iidx++)
1058     {
1059         scratch[iidx] = 0.0;
1060     }
1061
1062     /* Start outer loop over neighborlists */
1063     for(iidx=0; iidx<nri; iidx++)
1064     {
1065         /* Load shift vector for this list */
1066         i_shift_offset   = DIM*shiftidx[iidx];
1067
1068         /* Load limits for loop over neighbors */
1069         j_index_start    = jindex[iidx];
1070         j_index_end      = jindex[iidx+1];
1071
1072         /* Get outer coordinate index */
1073         inr              = iinr[iidx];
1074         i_coord_offset   = DIM*inr;
1075
1076         /* Load i particle coords and add shift vector */
1077         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1078                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1079
1080         fix0             = _mm_setzero_ps();
1081         fiy0             = _mm_setzero_ps();
1082         fiz0             = _mm_setzero_ps();
1083         fix1             = _mm_setzero_ps();
1084         fiy1             = _mm_setzero_ps();
1085         fiz1             = _mm_setzero_ps();
1086         fix2             = _mm_setzero_ps();
1087         fiy2             = _mm_setzero_ps();
1088         fiz2             = _mm_setzero_ps();
1089
1090         /* Start inner kernel loop */
1091         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1092         {
1093
1094             /* Get j neighbor index, and coordinate index */
1095             jnrA             = jjnr[jidx];
1096             jnrB             = jjnr[jidx+1];
1097             jnrC             = jjnr[jidx+2];
1098             jnrD             = jjnr[jidx+3];
1099             j_coord_offsetA  = DIM*jnrA;
1100             j_coord_offsetB  = DIM*jnrB;
1101             j_coord_offsetC  = DIM*jnrC;
1102             j_coord_offsetD  = DIM*jnrD;
1103
1104             /* load j atom coordinates */
1105             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1106                                               x+j_coord_offsetC,x+j_coord_offsetD,
1107                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1108
1109             /* Calculate displacement vector */
1110             dx00             = _mm_sub_ps(ix0,jx0);
1111             dy00             = _mm_sub_ps(iy0,jy0);
1112             dz00             = _mm_sub_ps(iz0,jz0);
1113             dx01             = _mm_sub_ps(ix0,jx1);
1114             dy01             = _mm_sub_ps(iy0,jy1);
1115             dz01             = _mm_sub_ps(iz0,jz1);
1116             dx02             = _mm_sub_ps(ix0,jx2);
1117             dy02             = _mm_sub_ps(iy0,jy2);
1118             dz02             = _mm_sub_ps(iz0,jz2);
1119             dx10             = _mm_sub_ps(ix1,jx0);
1120             dy10             = _mm_sub_ps(iy1,jy0);
1121             dz10             = _mm_sub_ps(iz1,jz0);
1122             dx11             = _mm_sub_ps(ix1,jx1);
1123             dy11             = _mm_sub_ps(iy1,jy1);
1124             dz11             = _mm_sub_ps(iz1,jz1);
1125             dx12             = _mm_sub_ps(ix1,jx2);
1126             dy12             = _mm_sub_ps(iy1,jy2);
1127             dz12             = _mm_sub_ps(iz1,jz2);
1128             dx20             = _mm_sub_ps(ix2,jx0);
1129             dy20             = _mm_sub_ps(iy2,jy0);
1130             dz20             = _mm_sub_ps(iz2,jz0);
1131             dx21             = _mm_sub_ps(ix2,jx1);
1132             dy21             = _mm_sub_ps(iy2,jy1);
1133             dz21             = _mm_sub_ps(iz2,jz1);
1134             dx22             = _mm_sub_ps(ix2,jx2);
1135             dy22             = _mm_sub_ps(iy2,jy2);
1136             dz22             = _mm_sub_ps(iz2,jz2);
1137
1138             /* Calculate squared distance and things based on it */
1139             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1140             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1141             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1142             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1143             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1144             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1145             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1146             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1147             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1148
1149             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1150             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1151             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1152             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1153             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1154             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1155             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1156             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1157             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1158
1159             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1160             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1161             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1162             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1163             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1164             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1165             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1166             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1167             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1168
1169             fjx0             = _mm_setzero_ps();
1170             fjy0             = _mm_setzero_ps();
1171             fjz0             = _mm_setzero_ps();
1172             fjx1             = _mm_setzero_ps();
1173             fjy1             = _mm_setzero_ps();
1174             fjz1             = _mm_setzero_ps();
1175             fjx2             = _mm_setzero_ps();
1176             fjy2             = _mm_setzero_ps();
1177             fjz2             = _mm_setzero_ps();
1178
1179             /**************************
1180              * CALCULATE INTERACTIONS *
1181              **************************/
1182
1183             r00              = _mm_mul_ps(rsq00,rinv00);
1184
1185             /* Calculate table index by multiplying r with table scale and truncate to integer */
1186             rt               = _mm_mul_ps(r00,vftabscale);
1187             vfitab           = _mm_cvttps_epi32(rt);
1188 #ifdef __XOP__
1189             vfeps            = _mm_frcz_ps(rt);
1190 #else
1191             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1192 #endif
1193             twovfeps         = _mm_add_ps(vfeps,vfeps);
1194             vfitab           = _mm_slli_epi32(vfitab,3);
1195
1196             /* COULOMB ELECTROSTATICS */
1197             velec            = _mm_mul_ps(qq00,rinv00);
1198             felec            = _mm_mul_ps(velec,rinvsq00);
1199
1200             /* CUBIC SPLINE TABLE DISPERSION */
1201             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1202             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1203             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1204             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1205             _MM_TRANSPOSE4_PS(Y,F,G,H);
1206             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1207             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1208             fvdw6            = _mm_mul_ps(c6_00,FF);
1209
1210             /* CUBIC SPLINE TABLE REPULSION */
1211             vfitab           = _mm_add_epi32(vfitab,ifour);
1212             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1213             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1214             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1215             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1216             _MM_TRANSPOSE4_PS(Y,F,G,H);
1217             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1218             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1219             fvdw12           = _mm_mul_ps(c12_00,FF);
1220             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1221
1222             fscal            = _mm_add_ps(felec,fvdw);
1223
1224              /* Update vectorial force */
1225             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1226             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1227             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1228
1229             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1230             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1231             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1232
1233             /**************************
1234              * CALCULATE INTERACTIONS *
1235              **************************/
1236
1237             /* COULOMB ELECTROSTATICS */
1238             velec            = _mm_mul_ps(qq01,rinv01);
1239             felec            = _mm_mul_ps(velec,rinvsq01);
1240
1241             fscal            = felec;
1242
1243              /* Update vectorial force */
1244             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1245             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1246             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1247
1248             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1249             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1250             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1251
1252             /**************************
1253              * CALCULATE INTERACTIONS *
1254              **************************/
1255
1256             /* COULOMB ELECTROSTATICS */
1257             velec            = _mm_mul_ps(qq02,rinv02);
1258             felec            = _mm_mul_ps(velec,rinvsq02);
1259
1260             fscal            = felec;
1261
1262              /* Update vectorial force */
1263             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1264             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1265             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1266
1267             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1268             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1269             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1270
1271             /**************************
1272              * CALCULATE INTERACTIONS *
1273              **************************/
1274
1275             /* COULOMB ELECTROSTATICS */
1276             velec            = _mm_mul_ps(qq10,rinv10);
1277             felec            = _mm_mul_ps(velec,rinvsq10);
1278
1279             fscal            = felec;
1280
1281              /* Update vectorial force */
1282             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1283             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1284             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1285
1286             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1287             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1288             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1289
1290             /**************************
1291              * CALCULATE INTERACTIONS *
1292              **************************/
1293
1294             /* COULOMB ELECTROSTATICS */
1295             velec            = _mm_mul_ps(qq11,rinv11);
1296             felec            = _mm_mul_ps(velec,rinvsq11);
1297
1298             fscal            = felec;
1299
1300              /* Update vectorial force */
1301             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1302             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1303             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1304
1305             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1306             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1307             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1308
1309             /**************************
1310              * CALCULATE INTERACTIONS *
1311              **************************/
1312
1313             /* COULOMB ELECTROSTATICS */
1314             velec            = _mm_mul_ps(qq12,rinv12);
1315             felec            = _mm_mul_ps(velec,rinvsq12);
1316
1317             fscal            = felec;
1318
1319              /* Update vectorial force */
1320             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1321             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1322             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1323
1324             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1325             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1326             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1327
1328             /**************************
1329              * CALCULATE INTERACTIONS *
1330              **************************/
1331
1332             /* COULOMB ELECTROSTATICS */
1333             velec            = _mm_mul_ps(qq20,rinv20);
1334             felec            = _mm_mul_ps(velec,rinvsq20);
1335
1336             fscal            = felec;
1337
1338              /* Update vectorial force */
1339             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1340             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1341             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1342
1343             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1344             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1345             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1346
1347             /**************************
1348              * CALCULATE INTERACTIONS *
1349              **************************/
1350
1351             /* COULOMB ELECTROSTATICS */
1352             velec            = _mm_mul_ps(qq21,rinv21);
1353             felec            = _mm_mul_ps(velec,rinvsq21);
1354
1355             fscal            = felec;
1356
1357              /* Update vectorial force */
1358             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1359             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1360             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1361
1362             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1363             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1364             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1365
1366             /**************************
1367              * CALCULATE INTERACTIONS *
1368              **************************/
1369
1370             /* COULOMB ELECTROSTATICS */
1371             velec            = _mm_mul_ps(qq22,rinv22);
1372             felec            = _mm_mul_ps(velec,rinvsq22);
1373
1374             fscal            = felec;
1375
1376              /* Update vectorial force */
1377             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1378             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1379             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1380
1381             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1382             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1383             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1384
1385             fjptrA             = f+j_coord_offsetA;
1386             fjptrB             = f+j_coord_offsetB;
1387             fjptrC             = f+j_coord_offsetC;
1388             fjptrD             = f+j_coord_offsetD;
1389
1390             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1391                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1392
1393             /* Inner loop uses 297 flops */
1394         }
1395
1396         if(jidx<j_index_end)
1397         {
1398
1399             /* Get j neighbor index, and coordinate index */
1400             jnrlistA         = jjnr[jidx];
1401             jnrlistB         = jjnr[jidx+1];
1402             jnrlistC         = jjnr[jidx+2];
1403             jnrlistD         = jjnr[jidx+3];
1404             /* Sign of each element will be negative for non-real atoms.
1405              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1406              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1407              */
1408             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1409             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1410             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1411             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1412             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1413             j_coord_offsetA  = DIM*jnrA;
1414             j_coord_offsetB  = DIM*jnrB;
1415             j_coord_offsetC  = DIM*jnrC;
1416             j_coord_offsetD  = DIM*jnrD;
1417
1418             /* load j atom coordinates */
1419             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1420                                               x+j_coord_offsetC,x+j_coord_offsetD,
1421                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1422
1423             /* Calculate displacement vector */
1424             dx00             = _mm_sub_ps(ix0,jx0);
1425             dy00             = _mm_sub_ps(iy0,jy0);
1426             dz00             = _mm_sub_ps(iz0,jz0);
1427             dx01             = _mm_sub_ps(ix0,jx1);
1428             dy01             = _mm_sub_ps(iy0,jy1);
1429             dz01             = _mm_sub_ps(iz0,jz1);
1430             dx02             = _mm_sub_ps(ix0,jx2);
1431             dy02             = _mm_sub_ps(iy0,jy2);
1432             dz02             = _mm_sub_ps(iz0,jz2);
1433             dx10             = _mm_sub_ps(ix1,jx0);
1434             dy10             = _mm_sub_ps(iy1,jy0);
1435             dz10             = _mm_sub_ps(iz1,jz0);
1436             dx11             = _mm_sub_ps(ix1,jx1);
1437             dy11             = _mm_sub_ps(iy1,jy1);
1438             dz11             = _mm_sub_ps(iz1,jz1);
1439             dx12             = _mm_sub_ps(ix1,jx2);
1440             dy12             = _mm_sub_ps(iy1,jy2);
1441             dz12             = _mm_sub_ps(iz1,jz2);
1442             dx20             = _mm_sub_ps(ix2,jx0);
1443             dy20             = _mm_sub_ps(iy2,jy0);
1444             dz20             = _mm_sub_ps(iz2,jz0);
1445             dx21             = _mm_sub_ps(ix2,jx1);
1446             dy21             = _mm_sub_ps(iy2,jy1);
1447             dz21             = _mm_sub_ps(iz2,jz1);
1448             dx22             = _mm_sub_ps(ix2,jx2);
1449             dy22             = _mm_sub_ps(iy2,jy2);
1450             dz22             = _mm_sub_ps(iz2,jz2);
1451
1452             /* Calculate squared distance and things based on it */
1453             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1454             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1455             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1456             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1457             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1458             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1459             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1460             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1461             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1462
1463             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1464             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1465             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1466             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1467             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1468             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1469             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1470             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1471             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1472
1473             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1474             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1475             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1476             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1477             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1478             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1479             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1480             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1481             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1482
1483             fjx0             = _mm_setzero_ps();
1484             fjy0             = _mm_setzero_ps();
1485             fjz0             = _mm_setzero_ps();
1486             fjx1             = _mm_setzero_ps();
1487             fjy1             = _mm_setzero_ps();
1488             fjz1             = _mm_setzero_ps();
1489             fjx2             = _mm_setzero_ps();
1490             fjy2             = _mm_setzero_ps();
1491             fjz2             = _mm_setzero_ps();
1492
1493             /**************************
1494              * CALCULATE INTERACTIONS *
1495              **************************/
1496
1497             r00              = _mm_mul_ps(rsq00,rinv00);
1498             r00              = _mm_andnot_ps(dummy_mask,r00);
1499
1500             /* Calculate table index by multiplying r with table scale and truncate to integer */
1501             rt               = _mm_mul_ps(r00,vftabscale);
1502             vfitab           = _mm_cvttps_epi32(rt);
1503 #ifdef __XOP__
1504             vfeps            = _mm_frcz_ps(rt);
1505 #else
1506             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1507 #endif
1508             twovfeps         = _mm_add_ps(vfeps,vfeps);
1509             vfitab           = _mm_slli_epi32(vfitab,3);
1510
1511             /* COULOMB ELECTROSTATICS */
1512             velec            = _mm_mul_ps(qq00,rinv00);
1513             felec            = _mm_mul_ps(velec,rinvsq00);
1514
1515             /* CUBIC SPLINE TABLE DISPERSION */
1516             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1517             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1518             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1519             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1520             _MM_TRANSPOSE4_PS(Y,F,G,H);
1521             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1522             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1523             fvdw6            = _mm_mul_ps(c6_00,FF);
1524
1525             /* CUBIC SPLINE TABLE REPULSION */
1526             vfitab           = _mm_add_epi32(vfitab,ifour);
1527             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1528             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1529             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1530             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1531             _MM_TRANSPOSE4_PS(Y,F,G,H);
1532             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1533             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1534             fvdw12           = _mm_mul_ps(c12_00,FF);
1535             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1536
1537             fscal            = _mm_add_ps(felec,fvdw);
1538
1539             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1540
1541              /* Update vectorial force */
1542             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1543             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1544             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1545
1546             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1547             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1548             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1549
1550             /**************************
1551              * CALCULATE INTERACTIONS *
1552              **************************/
1553
1554             /* COULOMB ELECTROSTATICS */
1555             velec            = _mm_mul_ps(qq01,rinv01);
1556             felec            = _mm_mul_ps(velec,rinvsq01);
1557
1558             fscal            = felec;
1559
1560             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1561
1562              /* Update vectorial force */
1563             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1564             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1565             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1566
1567             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1568             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1569             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1570
1571             /**************************
1572              * CALCULATE INTERACTIONS *
1573              **************************/
1574
1575             /* COULOMB ELECTROSTATICS */
1576             velec            = _mm_mul_ps(qq02,rinv02);
1577             felec            = _mm_mul_ps(velec,rinvsq02);
1578
1579             fscal            = felec;
1580
1581             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1582
1583              /* Update vectorial force */
1584             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1585             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1586             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1587
1588             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1589             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1590             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1591
1592             /**************************
1593              * CALCULATE INTERACTIONS *
1594              **************************/
1595
1596             /* COULOMB ELECTROSTATICS */
1597             velec            = _mm_mul_ps(qq10,rinv10);
1598             felec            = _mm_mul_ps(velec,rinvsq10);
1599
1600             fscal            = felec;
1601
1602             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1603
1604              /* Update vectorial force */
1605             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1606             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1607             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1608
1609             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1610             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1611             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1612
1613             /**************************
1614              * CALCULATE INTERACTIONS *
1615              **************************/
1616
1617             /* COULOMB ELECTROSTATICS */
1618             velec            = _mm_mul_ps(qq11,rinv11);
1619             felec            = _mm_mul_ps(velec,rinvsq11);
1620
1621             fscal            = felec;
1622
1623             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1624
1625              /* Update vectorial force */
1626             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1627             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1628             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1629
1630             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1631             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1632             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1633
1634             /**************************
1635              * CALCULATE INTERACTIONS *
1636              **************************/
1637
1638             /* COULOMB ELECTROSTATICS */
1639             velec            = _mm_mul_ps(qq12,rinv12);
1640             felec            = _mm_mul_ps(velec,rinvsq12);
1641
1642             fscal            = felec;
1643
1644             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1645
1646              /* Update vectorial force */
1647             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1648             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1649             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1650
1651             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1652             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1653             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1654
1655             /**************************
1656              * CALCULATE INTERACTIONS *
1657              **************************/
1658
1659             /* COULOMB ELECTROSTATICS */
1660             velec            = _mm_mul_ps(qq20,rinv20);
1661             felec            = _mm_mul_ps(velec,rinvsq20);
1662
1663             fscal            = felec;
1664
1665             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1666
1667              /* Update vectorial force */
1668             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1669             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1670             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1671
1672             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1673             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1674             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1675
1676             /**************************
1677              * CALCULATE INTERACTIONS *
1678              **************************/
1679
1680             /* COULOMB ELECTROSTATICS */
1681             velec            = _mm_mul_ps(qq21,rinv21);
1682             felec            = _mm_mul_ps(velec,rinvsq21);
1683
1684             fscal            = felec;
1685
1686             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1687
1688              /* Update vectorial force */
1689             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1690             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1691             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1692
1693             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1694             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1695             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1696
1697             /**************************
1698              * CALCULATE INTERACTIONS *
1699              **************************/
1700
1701             /* COULOMB ELECTROSTATICS */
1702             velec            = _mm_mul_ps(qq22,rinv22);
1703             felec            = _mm_mul_ps(velec,rinvsq22);
1704
1705             fscal            = felec;
1706
1707             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1708
1709              /* Update vectorial force */
1710             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1711             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1712             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1713
1714             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1715             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1716             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1717
1718             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1719             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1720             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1721             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1722
1723             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1724                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1725
1726             /* Inner loop uses 298 flops */
1727         }
1728
1729         /* End of innermost loop */
1730
1731         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1732                                               f+i_coord_offset,fshift+i_shift_offset);
1733
1734         /* Increment number of inner iterations */
1735         inneriter                  += j_index_end - j_index_start;
1736
1737         /* Outer loop uses 18 flops */
1738     }
1739
1740     /* Increment number of outer iterations */
1741     outeriter        += nri;
1742
1743     /* Update outer/inner flops */
1744
1745     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*298);
1746 }