src/gromacs/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/nb_kernel_ElecRF_VdwNone_GeomW3W3_avx_128_fma_single.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*
  36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
  37  */
  38 #ifdef HAVE_CONFIG_H
  39 #include <config.h>
  40 #endif
  41
  42 #include <math.h>
  43
  44 #include "../nb_kernel.h"
  45 #include "types/simple.h"
  46 #include "gromacs/math/vec.h"
  47 #include "nrnb.h"
  48
  49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
  50 #include "kernelutil_x86_avx_128_fma_single.h"
  51
  52 /*
  53  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_128_fma_single
  54  * Electrostatics interaction: ReactionField
  55  * VdW interaction:            None
  56  * Geometry:                   Water3-Water3
  57  * Calculate force/pot:        PotentialAndForce
  58  */
  59 void
  60 nb_kernel_ElecRF_VdwNone_GeomW3W3_VF_avx_128_fma_single
  61                     (t_nblist                    * gmx_restrict       nlist,
  62                      rvec                        * gmx_restrict          xx,
  63                      rvec                        * gmx_restrict          ff,
  64                      t_forcerec                  * gmx_restrict          fr,
  65                      t_mdatoms                   * gmx_restrict     mdatoms,
  66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
  67                      t_nrnb                      * gmx_restrict        nrnb)
  68 {
  69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
  70      * just 0 for non-waters.
  71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, i.e. to the four different
  72      * jnr indices whose data is placed in the four positions of the SIMD register.
  73      */
  74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
  75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
  76     int              jnrA,jnrB,jnrC,jnrD;
  77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
  78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
  79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
  80     real             rcutoff_scalar;
  81     real             *shiftvec,*fshift,*x,*f;
  82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
  83     real             scratch[4*DIM];
  84     __m128           fscal,rcutoff,rcutoff2,jidxall;
  85     int              vdwioffset0;
  86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
  87     int              vdwioffset1;
  88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
  89     int              vdwioffset2;
  90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
  91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
  92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
  93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
  94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
  95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
  96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
  97     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
  98     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
  99     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
 100     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
 101     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
 102     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
 103     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
 104     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
 105     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
 106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 107     real             *charge;
 108     __m128           dummy_mask,cutoff_mask;
 109     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 110     __m128           one     = _mm_set1_ps(1.0);
 111     __m128           two     = _mm_set1_ps(2.0);
 112     x                = xx[0];
 113     f                = ff[0];
 114
 115     nri              = nlist->nri;
 116     iinr             = nlist->iinr;
 117     jindex           = nlist->jindex;
 118     jjnr             = nlist->jjnr;
 119     shiftidx         = nlist->shift;
 120     gid              = nlist->gid;
 121     shiftvec         = fr->shift_vec[0];
 122     fshift           = fr->fshift[0];
 123     facel            = _mm_set1_ps(fr->epsfac);
 124     charge           = mdatoms->chargeA;
 125     krf              = _mm_set1_ps(fr->ic->k_rf);
 126     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
 127     crf              = _mm_set1_ps(fr->ic->c_rf);
 128
 129     /* Setup water-specific parameters */
 130     inr              = nlist->iinr[0];
 131     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
 132     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
 133     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
 134
 135     jq0              = _mm_set1_ps(charge[inr+0]);
 136     jq1              = _mm_set1_ps(charge[inr+1]);
 137     jq2              = _mm_set1_ps(charge[inr+2]);
 138     qq00             = _mm_mul_ps(iq0,jq0);
 139     qq01             = _mm_mul_ps(iq0,jq1);
 140     qq02             = _mm_mul_ps(iq0,jq2);
 141     qq10             = _mm_mul_ps(iq1,jq0);
 142     qq11             = _mm_mul_ps(iq1,jq1);
 143     qq12             = _mm_mul_ps(iq1,jq2);
 144     qq20             = _mm_mul_ps(iq2,jq0);
 145     qq21             = _mm_mul_ps(iq2,jq1);
 146     qq22             = _mm_mul_ps(iq2,jq2);
 147
 148     /* Avoid stupid compiler warnings */
 149     jnrA = jnrB = jnrC = jnrD = 0;
 150     j_coord_offsetA = 0;
 151     j_coord_offsetB = 0;
 152     j_coord_offsetC = 0;
 153     j_coord_offsetD = 0;
 154
 155     outeriter        = 0;
 156     inneriter        = 0;
 157
 158     for(iidx=0;iidx<4*DIM;iidx++)
 159     {
 160         scratch[iidx] = 0.0;
 161     }
 162
 163     /* Start outer loop over neighborlists */
 164     for(iidx=0; iidx<nri; iidx++)
 165     {
 166         /* Load shift vector for this list */
 167         i_shift_offset   = DIM*shiftidx[iidx];
 168
 169         /* Load limits for loop over neighbors */
 170         j_index_start    = jindex[iidx];
 171         j_index_end      = jindex[iidx+1];
 172
 173         /* Get outer coordinate index */
 174         inr              = iinr[iidx];
 175         i_coord_offset   = DIM*inr;
 176
 177         /* Load i particle coords and add shift vector */
 178         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 179                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 180
 181         fix0             = _mm_setzero_ps();
 182         fiy0             = _mm_setzero_ps();
 183         fiz0             = _mm_setzero_ps();
 184         fix1             = _mm_setzero_ps();
 185         fiy1             = _mm_setzero_ps();
 186         fiz1             = _mm_setzero_ps();
 187         fix2             = _mm_setzero_ps();
 188         fiy2             = _mm_setzero_ps();
 189         fiz2             = _mm_setzero_ps();
 190
 191         /* Reset potential sums */
 192         velecsum         = _mm_setzero_ps();
 193
 194         /* Start inner kernel loop */
 195         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 196         {
 197
 198             /* Get j neighbor index, and coordinate index */
 199             jnrA             = jjnr[jidx];
 200             jnrB             = jjnr[jidx+1];
 201             jnrC             = jjnr[jidx+2];
 202             jnrD             = jjnr[jidx+3];
 203             j_coord_offsetA  = DIM*jnrA;
 204             j_coord_offsetB  = DIM*jnrB;
 205             j_coord_offsetC  = DIM*jnrC;
 206             j_coord_offsetD  = DIM*jnrD;
 207
 208             /* load j atom coordinates */
 209             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 210                                               x+j_coord_offsetC,x+j_coord_offsetD,
 211                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 212
 213             /* Calculate displacement vector */
 214             dx00             = _mm_sub_ps(ix0,jx0);
 215             dy00             = _mm_sub_ps(iy0,jy0);
 216             dz00             = _mm_sub_ps(iz0,jz0);
 217             dx01             = _mm_sub_ps(ix0,jx1);
 218             dy01             = _mm_sub_ps(iy0,jy1);
 219             dz01             = _mm_sub_ps(iz0,jz1);
 220             dx02             = _mm_sub_ps(ix0,jx2);
 221             dy02             = _mm_sub_ps(iy0,jy2);
 222             dz02             = _mm_sub_ps(iz0,jz2);
 223             dx10             = _mm_sub_ps(ix1,jx0);
 224             dy10             = _mm_sub_ps(iy1,jy0);
 225             dz10             = _mm_sub_ps(iz1,jz0);
 226             dx11             = _mm_sub_ps(ix1,jx1);
 227             dy11             = _mm_sub_ps(iy1,jy1);
 228             dz11             = _mm_sub_ps(iz1,jz1);
 229             dx12             = _mm_sub_ps(ix1,jx2);
 230             dy12             = _mm_sub_ps(iy1,jy2);
 231             dz12             = _mm_sub_ps(iz1,jz2);
 232             dx20             = _mm_sub_ps(ix2,jx0);
 233             dy20             = _mm_sub_ps(iy2,jy0);
 234             dz20             = _mm_sub_ps(iz2,jz0);
 235             dx21             = _mm_sub_ps(ix2,jx1);
 236             dy21             = _mm_sub_ps(iy2,jy1);
 237             dz21             = _mm_sub_ps(iz2,jz1);
 238             dx22             = _mm_sub_ps(ix2,jx2);
 239             dy22             = _mm_sub_ps(iy2,jy2);
 240             dz22             = _mm_sub_ps(iz2,jz2);
 241
 242             /* Calculate squared distance and things based on it */
 243             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 244             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
 245             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
 246             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
 247             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
 248             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
 249             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
 250             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
 251             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
 252
 253             rinv00           = gmx_mm_invsqrt_ps(rsq00);
 254             rinv01           = gmx_mm_invsqrt_ps(rsq01);
 255             rinv02           = gmx_mm_invsqrt_ps(rsq02);
 256             rinv10           = gmx_mm_invsqrt_ps(rsq10);
 257             rinv11           = gmx_mm_invsqrt_ps(rsq11);
 258             rinv12           = gmx_mm_invsqrt_ps(rsq12);
 259             rinv20           = gmx_mm_invsqrt_ps(rsq20);
 260             rinv21           = gmx_mm_invsqrt_ps(rsq21);
 261             rinv22           = gmx_mm_invsqrt_ps(rsq22);
 262
 263             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 264             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
 265             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
 266             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
 267             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
 268             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
 269             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
 270             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
 271             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
 272
 273             fjx0             = _mm_setzero_ps();
 274             fjy0             = _mm_setzero_ps();
 275             fjz0             = _mm_setzero_ps();
 276             fjx1             = _mm_setzero_ps();
 277             fjy1             = _mm_setzero_ps();
 278             fjz1             = _mm_setzero_ps();
 279             fjx2             = _mm_setzero_ps();
 280             fjy2             = _mm_setzero_ps();
 281             fjz2             = _mm_setzero_ps();
 282
 283             /**************************
 284              * CALCULATE INTERACTIONS *
 285              **************************/
 286
 287             /* REACTION-FIELD ELECTROSTATICS */
 288             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_macc_ps(krf,rsq00,rinv00),crf));
 289             felec            = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
 290
 291             /* Update potential sum for this i atom from the interaction with this j atom. */
 292             velecsum         = _mm_add_ps(velecsum,velec);
 293
 294             fscal            = felec;
 295
 296              /* Update vectorial force */
 297             fix0             = _mm_macc_ps(dx00,fscal,fix0);
 298             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 299             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 300
 301             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
 302             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
 303             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 304
 305             /**************************
 306              * CALCULATE INTERACTIONS *
 307              **************************/
 308
 309             /* REACTION-FIELD ELECTROSTATICS */
 310             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_macc_ps(krf,rsq01,rinv01),crf));
 311             felec            = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
 312
 313             /* Update potential sum for this i atom from the interaction with this j atom. */
 314             velecsum         = _mm_add_ps(velecsum,velec);
 315
 316             fscal            = felec;
 317
 318              /* Update vectorial force */
 319             fix0             = _mm_macc_ps(dx01,fscal,fix0);
 320             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
 321             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
 322
 323             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
 324             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
 325             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
 326
 327             /**************************
 328              * CALCULATE INTERACTIONS *
 329              **************************/
 330
 331             /* REACTION-FIELD ELECTROSTATICS */
 332             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_macc_ps(krf,rsq02,rinv02),crf));
 333             felec            = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
 334
 335             /* Update potential sum for this i atom from the interaction with this j atom. */
 336             velecsum         = _mm_add_ps(velecsum,velec);
 337
 338             fscal            = felec;
 339
 340              /* Update vectorial force */
 341             fix0             = _mm_macc_ps(dx02,fscal,fix0);
 342             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
 343             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
 344
 345             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
 346             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
 347             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
 348
 349             /**************************
 350              * CALCULATE INTERACTIONS *
 351              **************************/
 352
 353             /* REACTION-FIELD ELECTROSTATICS */
 354             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_macc_ps(krf,rsq10,rinv10),crf));
 355             felec            = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
 356
 357             /* Update potential sum for this i atom from the interaction with this j atom. */
 358             velecsum         = _mm_add_ps(velecsum,velec);
 359
 360             fscal            = felec;
 361
 362              /* Update vectorial force */
 363             fix1             = _mm_macc_ps(dx10,fscal,fix1);
 364             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
 365             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 366
 367             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
 368             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
 369             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 370
 371             /**************************
 372              * CALCULATE INTERACTIONS *
 373              **************************/
 374
 375             /* REACTION-FIELD ELECTROSTATICS */
 376             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
 377             felec            = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
 378
 379             /* Update potential sum for this i atom from the interaction with this j atom. */
 380             velecsum         = _mm_add_ps(velecsum,velec);
 381
 382             fscal            = felec;
 383
 384              /* Update vectorial force */
 385             fix1             = _mm_macc_ps(dx11,fscal,fix1);
 386             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
 387             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
 388
 389             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
 390             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
 391             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
 392
 393             /**************************
 394              * CALCULATE INTERACTIONS *
 395              **************************/
 396
 397             /* REACTION-FIELD ELECTROSTATICS */
 398             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
 399             felec            = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
 400
 401             /* Update potential sum for this i atom from the interaction with this j atom. */
 402             velecsum         = _mm_add_ps(velecsum,velec);
 403
 404             fscal            = felec;
 405
 406              /* Update vectorial force */
 407             fix1             = _mm_macc_ps(dx12,fscal,fix1);
 408             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
 409             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
 410
 411             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
 412             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
 413             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
 414
 415             /**************************
 416              * CALCULATE INTERACTIONS *
 417              **************************/
 418
 419             /* REACTION-FIELD ELECTROSTATICS */
 420             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_macc_ps(krf,rsq20,rinv20),crf));
 421             felec            = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
 422
 423             /* Update potential sum for this i atom from the interaction with this j atom. */
 424             velecsum         = _mm_add_ps(velecsum,velec);
 425
 426             fscal            = felec;
 427
 428              /* Update vectorial force */
 429             fix2             = _mm_macc_ps(dx20,fscal,fix2);
 430             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
 431             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 432
 433             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
 434             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
 435             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 436
 437             /**************************
 438              * CALCULATE INTERACTIONS *
 439              **************************/
 440
 441             /* REACTION-FIELD ELECTROSTATICS */
 442             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
 443             felec            = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
 444
 445             /* Update potential sum for this i atom from the interaction with this j atom. */
 446             velecsum         = _mm_add_ps(velecsum,velec);
 447
 448             fscal            = felec;
 449
 450              /* Update vectorial force */
 451             fix2             = _mm_macc_ps(dx21,fscal,fix2);
 452             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
 453             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
 454
 455             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
 456             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
 457             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
 458
 459             /**************************
 460              * CALCULATE INTERACTIONS *
 461              **************************/
 462
 463             /* REACTION-FIELD ELECTROSTATICS */
 464             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
 465             felec            = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
 466
 467             /* Update potential sum for this i atom from the interaction with this j atom. */
 468             velecsum         = _mm_add_ps(velecsum,velec);
 469
 470             fscal            = felec;
 471
 472              /* Update vectorial force */
 473             fix2             = _mm_macc_ps(dx22,fscal,fix2);
 474             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
 475             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
 476
 477             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
 478             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
 479             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
 480
 481             fjptrA             = f+j_coord_offsetA;
 482             fjptrB             = f+j_coord_offsetB;
 483             fjptrC             = f+j_coord_offsetC;
 484             fjptrD             = f+j_coord_offsetD;
 485
 486             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 487                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 488
 489             /* Inner loop uses 315 flops */
 490         }
 491
 492         if(jidx<j_index_end)
 493         {
 494
 495             /* Get j neighbor index, and coordinate index */
 496             jnrlistA         = jjnr[jidx];
 497             jnrlistB         = jjnr[jidx+1];
 498             jnrlistC         = jjnr[jidx+2];
 499             jnrlistD         = jjnr[jidx+3];
 500             /* Sign of each element will be negative for non-real atoms.
 501              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
 502              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
 503              */
 504             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
 505             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
 506             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
 507             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
 508             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
 509             j_coord_offsetA  = DIM*jnrA;
 510             j_coord_offsetB  = DIM*jnrB;
 511             j_coord_offsetC  = DIM*jnrC;
 512             j_coord_offsetD  = DIM*jnrD;
 513
 514             /* load j atom coordinates */
 515             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
 516                                               x+j_coord_offsetC,x+j_coord_offsetD,
 517                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
 518
 519             /* Calculate displacement vector */
 520             dx00             = _mm_sub_ps(ix0,jx0);
 521             dy00             = _mm_sub_ps(iy0,jy0);
 522             dz00             = _mm_sub_ps(iz0,jz0);
 523             dx01             = _mm_sub_ps(ix0,jx1);
 524             dy01             = _mm_sub_ps(iy0,jy1);
 525             dz01             = _mm_sub_ps(iz0,jz1);
 526             dx02             = _mm_sub_ps(ix0,jx2);
 527             dy02             = _mm_sub_ps(iy0,jy2);
 528             dz02             = _mm_sub_ps(iz0,jz2);
 529             dx10             = _mm_sub_ps(ix1,jx0);
 530             dy10             = _mm_sub_ps(iy1,jy0);
 531             dz10             = _mm_sub_ps(iz1,jz0);
 532             dx11             = _mm_sub_ps(ix1,jx1);
 533             dy11             = _mm_sub_ps(iy1,jy1);
 534             dz11             = _mm_sub_ps(iz1,jz1);
 535             dx12             = _mm_sub_ps(ix1,jx2);
 536             dy12             = _mm_sub_ps(iy1,jy2);
 537             dz12             = _mm_sub_ps(iz1,jz2);
 538             dx20             = _mm_sub_ps(ix2,jx0);
 539             dy20             = _mm_sub_ps(iy2,jy0);
 540             dz20             = _mm_sub_ps(iz2,jz0);
 541             dx21             = _mm_sub_ps(ix2,jx1);
 542             dy21             = _mm_sub_ps(iy2,jy1);
 543             dz21             = _mm_sub_ps(iz2,jz1);
 544             dx22             = _mm_sub_ps(ix2,jx2);
 545             dy22             = _mm_sub_ps(iy2,jy2);
 546             dz22             = _mm_sub_ps(iz2,jz2);
 547
 548             /* Calculate squared distance and things based on it */
 549             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
 550             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
 551             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
 552             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
 553             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
 554             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
 555             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
 556             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
 557             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
 558
 559             rinv00           = gmx_mm_invsqrt_ps(rsq00);
 560             rinv01           = gmx_mm_invsqrt_ps(rsq01);
 561             rinv02           = gmx_mm_invsqrt_ps(rsq02);
 562             rinv10           = gmx_mm_invsqrt_ps(rsq10);
 563             rinv11           = gmx_mm_invsqrt_ps(rsq11);
 564             rinv12           = gmx_mm_invsqrt_ps(rsq12);
 565             rinv20           = gmx_mm_invsqrt_ps(rsq20);
 566             rinv21           = gmx_mm_invsqrt_ps(rsq21);
 567             rinv22           = gmx_mm_invsqrt_ps(rsq22);
 568
 569             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
 570             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
 571             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
 572             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
 573             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
 574             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
 575             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
 576             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
 577             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
 578
 579             fjx0             = _mm_setzero_ps();
 580             fjy0             = _mm_setzero_ps();
 581             fjz0             = _mm_setzero_ps();
 582             fjx1             = _mm_setzero_ps();
 583             fjy1             = _mm_setzero_ps();
 584             fjz1             = _mm_setzero_ps();
 585             fjx2             = _mm_setzero_ps();
 586             fjy2             = _mm_setzero_ps();
 587             fjz2             = _mm_setzero_ps();
 588
 589             /**************************
 590              * CALCULATE INTERACTIONS *
 591              **************************/
 592
 593             /* REACTION-FIELD ELECTROSTATICS */
 594             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_macc_ps(krf,rsq00,rinv00),crf));
 595             felec            = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
 596
 597             /* Update potential sum for this i atom from the interaction with this j atom. */
 598             velec            = _mm_andnot_ps(dummy_mask,velec);
 599             velecsum         = _mm_add_ps(velecsum,velec);
 600
 601             fscal            = felec;
 602
 603             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 604
 605              /* Update vectorial force */
 606             fix0             = _mm_macc_ps(dx00,fscal,fix0);
 607             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
 608             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
 609
 610             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
 611             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
 612             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
 613
 614             /**************************
 615              * CALCULATE INTERACTIONS *
 616              **************************/
 617
 618             /* REACTION-FIELD ELECTROSTATICS */
 619             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_macc_ps(krf,rsq01,rinv01),crf));
 620             felec            = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
 621
 622             /* Update potential sum for this i atom from the interaction with this j atom. */
 623             velec            = _mm_andnot_ps(dummy_mask,velec);
 624             velecsum         = _mm_add_ps(velecsum,velec);
 625
 626             fscal            = felec;
 627
 628             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 629
 630              /* Update vectorial force */
 631             fix0             = _mm_macc_ps(dx01,fscal,fix0);
 632             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
 633             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
 634
 635             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
 636             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
 637             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
 638
 639             /**************************
 640              * CALCULATE INTERACTIONS *
 641              **************************/
 642
 643             /* REACTION-FIELD ELECTROSTATICS */
 644             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_macc_ps(krf,rsq02,rinv02),crf));
 645             felec            = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
 646
 647             /* Update potential sum for this i atom from the interaction with this j atom. */
 648             velec            = _mm_andnot_ps(dummy_mask,velec);
 649             velecsum         = _mm_add_ps(velecsum,velec);
 650
 651             fscal            = felec;
 652
 653             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 654
 655              /* Update vectorial force */
 656             fix0             = _mm_macc_ps(dx02,fscal,fix0);
 657             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
 658             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
 659
 660             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
 661             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
 662             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
 663
 664             /**************************
 665              * CALCULATE INTERACTIONS *
 666              **************************/
 667
 668             /* REACTION-FIELD ELECTROSTATICS */
 669             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_macc_ps(krf,rsq10,rinv10),crf));
 670             felec            = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
 671
 672             /* Update potential sum for this i atom from the interaction with this j atom. */
 673             velec            = _mm_andnot_ps(dummy_mask,velec);
 674             velecsum         = _mm_add_ps(velecsum,velec);
 675
 676             fscal            = felec;
 677
 678             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 679
 680              /* Update vectorial force */
 681             fix1             = _mm_macc_ps(dx10,fscal,fix1);
 682             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
 683             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
 684
 685             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
 686             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
 687             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
 688
 689             /**************************
 690              * CALCULATE INTERACTIONS *
 691              **************************/
 692
 693             /* REACTION-FIELD ELECTROSTATICS */
 694             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_macc_ps(krf,rsq11,rinv11),crf));
 695             felec            = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
 696
 697             /* Update potential sum for this i atom from the interaction with this j atom. */
 698             velec            = _mm_andnot_ps(dummy_mask,velec);
 699             velecsum         = _mm_add_ps(velecsum,velec);
 700
 701             fscal            = felec;
 702
 703             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 704
 705              /* Update vectorial force */
 706             fix1             = _mm_macc_ps(dx11,fscal,fix1);
 707             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
 708             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
 709
 710             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
 711             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
 712             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
 713
 714             /**************************
 715              * CALCULATE INTERACTIONS *
 716              **************************/
 717
 718             /* REACTION-FIELD ELECTROSTATICS */
 719             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_macc_ps(krf,rsq12,rinv12),crf));
 720             felec            = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
 721
 722             /* Update potential sum for this i atom from the interaction with this j atom. */
 723             velec            = _mm_andnot_ps(dummy_mask,velec);
 724             velecsum         = _mm_add_ps(velecsum,velec);
 725
 726             fscal            = felec;
 727
 728             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 729
 730              /* Update vectorial force */
 731             fix1             = _mm_macc_ps(dx12,fscal,fix1);
 732             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
 733             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
 734
 735             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
 736             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
 737             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
 738
 739             /**************************
 740              * CALCULATE INTERACTIONS *
 741              **************************/
 742
 743             /* REACTION-FIELD ELECTROSTATICS */
 744             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_macc_ps(krf,rsq20,rinv20),crf));
 745             felec            = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
 746
 747             /* Update potential sum for this i atom from the interaction with this j atom. */
 748             velec            = _mm_andnot_ps(dummy_mask,velec);
 749             velecsum         = _mm_add_ps(velecsum,velec);
 750
 751             fscal            = felec;
 752
 753             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 754
 755              /* Update vectorial force */
 756             fix2             = _mm_macc_ps(dx20,fscal,fix2);
 757             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
 758             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
 759
 760             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
 761             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
 762             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
 763
 764             /**************************
 765              * CALCULATE INTERACTIONS *
 766              **************************/
 767
 768             /* REACTION-FIELD ELECTROSTATICS */
 769             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_macc_ps(krf,rsq21,rinv21),crf));
 770             felec            = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
 771
 772             /* Update potential sum for this i atom from the interaction with this j atom. */
 773             velec            = _mm_andnot_ps(dummy_mask,velec);
 774             velecsum         = _mm_add_ps(velecsum,velec);
 775
 776             fscal            = felec;
 777
 778             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 779
 780              /* Update vectorial force */
 781             fix2             = _mm_macc_ps(dx21,fscal,fix2);
 782             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
 783             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
 784
 785             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
 786             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
 787             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
 788
 789             /**************************
 790              * CALCULATE INTERACTIONS *
 791              **************************/
 792
 793             /* REACTION-FIELD ELECTROSTATICS */
 794             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_macc_ps(krf,rsq22,rinv22),crf));
 795             felec            = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
 796
 797             /* Update potential sum for this i atom from the interaction with this j atom. */
 798             velec            = _mm_andnot_ps(dummy_mask,velec);
 799             velecsum         = _mm_add_ps(velecsum,velec);
 800
 801             fscal            = felec;
 802
 803             fscal            = _mm_andnot_ps(dummy_mask,fscal);
 804
 805              /* Update vectorial force */
 806             fix2             = _mm_macc_ps(dx22,fscal,fix2);
 807             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
 808             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
 809
 810             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
 811             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
 812             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
 813
 814             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
 815             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
 816             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
 817             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
 818
 819             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
 820                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
 821
 822             /* Inner loop uses 315 flops */
 823         }
 824
 825         /* End of innermost loop */
 826
 827         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
 828                                               f+i_coord_offset,fshift+i_shift_offset);
 829
 830         ggid                        = gid[iidx];
 831         /* Update potential energies */
 832         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
 833
 834         /* Increment number of inner iterations */
 835         inneriter                  += j_index_end - j_index_start;
 836
 837         /* Outer loop uses 19 flops */
 838     }
 839
 840     /* Increment number of outer iterations */
 841     outeriter        += nri;
 842
 843     /* Update outer/inner flops */
 844
 845     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*315);
 846 }
 847 /*
 848  * Gromacs nonbonded kernel:   nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_128_fma_single
 849  * Electrostatics interaction: ReactionField
 850  * VdW interaction:            None
 851  * Geometry:                   Water3-Water3
 852  * Calculate force/pot:        Force
 853  */
 854 void
 855 nb_kernel_ElecRF_VdwNone_GeomW3W3_F_avx_128_fma_single
 856                     (t_nblist                    * gmx_restrict       nlist,
 857                      rvec                        * gmx_restrict          xx,
 858                      rvec                        * gmx_restrict          ff,
 859                      t_forcerec                  * gmx_restrict          fr,
 860                      t_mdatoms                   * gmx_restrict     mdatoms,
 861                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
 862                      t_nrnb                      * gmx_restrict        nrnb)
 863 {
 864     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
 865      * just 0 for non-waters.
 866      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
 867      * jnr indices corresponding to data put in the four positions in the SIMD register.
 868      */
 869     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
 870     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
 871     int              jnrA,jnrB,jnrC,jnrD;
 872     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
 873     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
 874     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
 875     real             rcutoff_scalar;
 876     real             *shiftvec,*fshift,*x,*f;
 877     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
 878     real             scratch[4*DIM];
 879     __m128           fscal,rcutoff,rcutoff2,jidxall;
 880     int              vdwioffset0;
 881     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
 882     int              vdwioffset1;
 883     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
 884     int              vdwioffset2;
 885     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
 886     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
 887     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
 888     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
 889     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
 890     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
 891     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
 892     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
 893     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
 894     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
 895     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
 896     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
 897     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
 898     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
 899     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
 900     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
 901     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
 902     real             *charge;
 903     __m128           dummy_mask,cutoff_mask;
 904     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
 905     __m128           one     = _mm_set1_ps(1.0);
 906     __m128           two     = _mm_set1_ps(2.0);
 907     x                = xx[0];
 908     f                = ff[0];
 909
 910     nri              = nlist->nri;
 911     iinr             = nlist->iinr;
 912     jindex           = nlist->jindex;
 913     jjnr             = nlist->jjnr;
 914     shiftidx         = nlist->shift;
 915     gid              = nlist->gid;
 916     shiftvec         = fr->shift_vec[0];
 917     fshift           = fr->fshift[0];
 918     facel            = _mm_set1_ps(fr->epsfac);
 919     charge           = mdatoms->chargeA;
 920     krf              = _mm_set1_ps(fr->ic->k_rf);
 921     krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
 922     crf              = _mm_set1_ps(fr->ic->c_rf);
 923
 924     /* Setup water-specific parameters */
 925     inr              = nlist->iinr[0];
 926     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
 927     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
 928     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
 929
 930     jq0              = _mm_set1_ps(charge[inr+0]);
 931     jq1              = _mm_set1_ps(charge[inr+1]);
 932     jq2              = _mm_set1_ps(charge[inr+2]);
 933     qq00             = _mm_mul_ps(iq0,jq0);
 934     qq01             = _mm_mul_ps(iq0,jq1);
 935     qq02             = _mm_mul_ps(iq0,jq2);
 936     qq10             = _mm_mul_ps(iq1,jq0);
 937     qq11             = _mm_mul_ps(iq1,jq1);
 938     qq12             = _mm_mul_ps(iq1,jq2);
 939     qq20             = _mm_mul_ps(iq2,jq0);
 940     qq21             = _mm_mul_ps(iq2,jq1);
 941     qq22             = _mm_mul_ps(iq2,jq2);
 942
 943     /* Avoid stupid compiler warnings */
 944     jnrA = jnrB = jnrC = jnrD = 0;
 945     j_coord_offsetA = 0;
 946     j_coord_offsetB = 0;
 947     j_coord_offsetC = 0;
 948     j_coord_offsetD = 0;
 949
 950     outeriter        = 0;
 951     inneriter        = 0;
 952
 953     for(iidx=0;iidx<4*DIM;iidx++)
 954     {
 955         scratch[iidx] = 0.0;
 956     }
 957
 958     /* Start outer loop over neighborlists */
 959     for(iidx=0; iidx<nri; iidx++)
 960     {
 961         /* Load shift vector for this list */
 962         i_shift_offset   = DIM*shiftidx[iidx];
 963
 964         /* Load limits for loop over neighbors */
 965         j_index_start    = jindex[iidx];
 966         j_index_end      = jindex[iidx+1];
 967
 968         /* Get outer coordinate index */
 969         inr              = iinr[iidx];
 970         i_coord_offset   = DIM*inr;
 971
 972         /* Load i particle coords and add shift vector */
 973         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
 974                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
 975
 976         fix0             = _mm_setzero_ps();
 977         fiy0             = _mm_setzero_ps();
 978         fiz0             = _mm_setzero_ps();
 979         fix1             = _mm_setzero_ps();
 980         fiy1             = _mm_setzero_ps();
 981         fiz1             = _mm_setzero_ps();
 982         fix2             = _mm_setzero_ps();
 983         fiy2             = _mm_setzero_ps();
 984         fiz2             = _mm_setzero_ps();
 985
 986         /* Start inner kernel loop */
 987         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
 988         {
 989
 990             /* Get j neighbor index, and coordinate index */
 991             jnrA             = jjnr[jidx];
 992             jnrB             = jjnr[jidx+1];
 993             jnrC             = jjnr[jidx+2];
 994             jnrD             = jjnr[jidx+3];
 995             j_coord_offsetA  = DIM*jnrA;
 996             j_coord_offsetB  = DIM*jnrB;
 997             j_coord_offsetC  = DIM*jnrC;
 998             j_coord_offsetD  = DIM*jnrD;
 999
1000             /* load j atom coordinates */
1001             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1002                                               x+j_coord_offsetC,x+j_coord_offsetD,
1003                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1004
1005             /* Calculate displacement vector */
1006             dx00             = _mm_sub_ps(ix0,jx0);
1007             dy00             = _mm_sub_ps(iy0,jy0);
1008             dz00             = _mm_sub_ps(iz0,jz0);
1009             dx01             = _mm_sub_ps(ix0,jx1);
1010             dy01             = _mm_sub_ps(iy0,jy1);
1011             dz01             = _mm_sub_ps(iz0,jz1);
1012             dx02             = _mm_sub_ps(ix0,jx2);
1013             dy02             = _mm_sub_ps(iy0,jy2);
1014             dz02             = _mm_sub_ps(iz0,jz2);
1015             dx10             = _mm_sub_ps(ix1,jx0);
1016             dy10             = _mm_sub_ps(iy1,jy0);
1017             dz10             = _mm_sub_ps(iz1,jz0);
1018             dx11             = _mm_sub_ps(ix1,jx1);
1019             dy11             = _mm_sub_ps(iy1,jy1);
1020             dz11             = _mm_sub_ps(iz1,jz1);
1021             dx12             = _mm_sub_ps(ix1,jx2);
1022             dy12             = _mm_sub_ps(iy1,jy2);
1023             dz12             = _mm_sub_ps(iz1,jz2);
1024             dx20             = _mm_sub_ps(ix2,jx0);
1025             dy20             = _mm_sub_ps(iy2,jy0);
1026             dz20             = _mm_sub_ps(iz2,jz0);
1027             dx21             = _mm_sub_ps(ix2,jx1);
1028             dy21             = _mm_sub_ps(iy2,jy1);
1029             dz21             = _mm_sub_ps(iz2,jz1);
1030             dx22             = _mm_sub_ps(ix2,jx2);
1031             dy22             = _mm_sub_ps(iy2,jy2);
1032             dz22             = _mm_sub_ps(iz2,jz2);
1033
1034             /* Calculate squared distance and things based on it */
1035             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1036             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1037             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1038             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1039             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1040             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1041             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1042             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1043             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1044
1045             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1046             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1047             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1048             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1049             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1050             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1051             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1052             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1053             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1054
1055             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1056             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1057             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1058             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1059             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1060             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1061             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1062             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1063             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1064
1065             fjx0             = _mm_setzero_ps();
1066             fjy0             = _mm_setzero_ps();
1067             fjz0             = _mm_setzero_ps();
1068             fjx1             = _mm_setzero_ps();
1069             fjy1             = _mm_setzero_ps();
1070             fjz1             = _mm_setzero_ps();
1071             fjx2             = _mm_setzero_ps();
1072             fjy2             = _mm_setzero_ps();
1073             fjz2             = _mm_setzero_ps();
1074
1075             /**************************
1076              * CALCULATE INTERACTIONS *
1077              **************************/
1078
1079             /* REACTION-FIELD ELECTROSTATICS */
1080             felec            = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
1081
1082             fscal            = felec;
1083
1084              /* Update vectorial force */
1085             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1086             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1087             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1088
1089             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1090             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1091             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1092
1093             /**************************
1094              * CALCULATE INTERACTIONS *
1095              **************************/
1096
1097             /* REACTION-FIELD ELECTROSTATICS */
1098             felec            = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
1099
1100             fscal            = felec;
1101
1102              /* Update vectorial force */
1103             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1104             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1105             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1106
1107             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1108             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1109             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1110
1111             /**************************
1112              * CALCULATE INTERACTIONS *
1113              **************************/
1114
1115             /* REACTION-FIELD ELECTROSTATICS */
1116             felec            = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
1117
1118             fscal            = felec;
1119
1120              /* Update vectorial force */
1121             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1122             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1123             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1124
1125             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1126             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1127             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1128
1129             /**************************
1130              * CALCULATE INTERACTIONS *
1131              **************************/
1132
1133             /* REACTION-FIELD ELECTROSTATICS */
1134             felec            = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
1135
1136             fscal            = felec;
1137
1138              /* Update vectorial force */
1139             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1140             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1141             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1142
1143             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1144             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1145             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1146
1147             /**************************
1148              * CALCULATE INTERACTIONS *
1149              **************************/
1150
1151             /* REACTION-FIELD ELECTROSTATICS */
1152             felec            = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1153
1154             fscal            = felec;
1155
1156              /* Update vectorial force */
1157             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1158             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1159             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1160
1161             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1162             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1163             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1164
1165             /**************************
1166              * CALCULATE INTERACTIONS *
1167              **************************/
1168
1169             /* REACTION-FIELD ELECTROSTATICS */
1170             felec            = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1171
1172             fscal            = felec;
1173
1174              /* Update vectorial force */
1175             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1176             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1177             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1178
1179             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1180             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1181             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1182
1183             /**************************
1184              * CALCULATE INTERACTIONS *
1185              **************************/
1186
1187             /* REACTION-FIELD ELECTROSTATICS */
1188             felec            = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
1189
1190             fscal            = felec;
1191
1192              /* Update vectorial force */
1193             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1194             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1195             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1196
1197             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1198             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1199             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1200
1201             /**************************
1202              * CALCULATE INTERACTIONS *
1203              **************************/
1204
1205             /* REACTION-FIELD ELECTROSTATICS */
1206             felec            = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1207
1208             fscal            = felec;
1209
1210              /* Update vectorial force */
1211             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1212             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1213             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1214
1215             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1216             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1217             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1218
1219             /**************************
1220              * CALCULATE INTERACTIONS *
1221              **************************/
1222
1223             /* REACTION-FIELD ELECTROSTATICS */
1224             felec            = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1225
1226             fscal            = felec;
1227
1228              /* Update vectorial force */
1229             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1230             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1231             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1232
1233             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1234             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1235             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1236
1237             fjptrA             = f+j_coord_offsetA;
1238             fjptrB             = f+j_coord_offsetB;
1239             fjptrC             = f+j_coord_offsetC;
1240             fjptrD             = f+j_coord_offsetD;
1241
1242             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1243                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1244
1245             /* Inner loop uses 270 flops */
1246         }
1247
1248         if(jidx<j_index_end)
1249         {
1250
1251             /* Get j neighbor index, and coordinate index */
1252             jnrlistA         = jjnr[jidx];
1253             jnrlistB         = jjnr[jidx+1];
1254             jnrlistC         = jjnr[jidx+2];
1255             jnrlistD         = jjnr[jidx+3];
1256             /* Sign of each element will be negative for non-real atoms.
1257              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1258              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1259              */
1260             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1261             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1262             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1263             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1264             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1265             j_coord_offsetA  = DIM*jnrA;
1266             j_coord_offsetB  = DIM*jnrB;
1267             j_coord_offsetC  = DIM*jnrC;
1268             j_coord_offsetD  = DIM*jnrD;
1269
1270             /* load j atom coordinates */
1271             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1272                                               x+j_coord_offsetC,x+j_coord_offsetD,
1273                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1274
1275             /* Calculate displacement vector */
1276             dx00             = _mm_sub_ps(ix0,jx0);
1277             dy00             = _mm_sub_ps(iy0,jy0);
1278             dz00             = _mm_sub_ps(iz0,jz0);
1279             dx01             = _mm_sub_ps(ix0,jx1);
1280             dy01             = _mm_sub_ps(iy0,jy1);
1281             dz01             = _mm_sub_ps(iz0,jz1);
1282             dx02             = _mm_sub_ps(ix0,jx2);
1283             dy02             = _mm_sub_ps(iy0,jy2);
1284             dz02             = _mm_sub_ps(iz0,jz2);
1285             dx10             = _mm_sub_ps(ix1,jx0);
1286             dy10             = _mm_sub_ps(iy1,jy0);
1287             dz10             = _mm_sub_ps(iz1,jz0);
1288             dx11             = _mm_sub_ps(ix1,jx1);
1289             dy11             = _mm_sub_ps(iy1,jy1);
1290             dz11             = _mm_sub_ps(iz1,jz1);
1291             dx12             = _mm_sub_ps(ix1,jx2);
1292             dy12             = _mm_sub_ps(iy1,jy2);
1293             dz12             = _mm_sub_ps(iz1,jz2);
1294             dx20             = _mm_sub_ps(ix2,jx0);
1295             dy20             = _mm_sub_ps(iy2,jy0);
1296             dz20             = _mm_sub_ps(iz2,jz0);
1297             dx21             = _mm_sub_ps(ix2,jx1);
1298             dy21             = _mm_sub_ps(iy2,jy1);
1299             dz21             = _mm_sub_ps(iz2,jz1);
1300             dx22             = _mm_sub_ps(ix2,jx2);
1301             dy22             = _mm_sub_ps(iy2,jy2);
1302             dz22             = _mm_sub_ps(iz2,jz2);
1303
1304             /* Calculate squared distance and things based on it */
1305             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1306             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1307             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1308             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1309             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1310             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1311             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1312             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1313             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1314
1315             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1316             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1317             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1318             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1319             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1320             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1321             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1322             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1323             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1324
1325             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1326             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1327             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1328             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1329             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1330             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1331             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1332             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1333             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1334
1335             fjx0             = _mm_setzero_ps();
1336             fjy0             = _mm_setzero_ps();
1337             fjz0             = _mm_setzero_ps();
1338             fjx1             = _mm_setzero_ps();
1339             fjy1             = _mm_setzero_ps();
1340             fjz1             = _mm_setzero_ps();
1341             fjx2             = _mm_setzero_ps();
1342             fjy2             = _mm_setzero_ps();
1343             fjz2             = _mm_setzero_ps();
1344
1345             /**************************
1346              * CALCULATE INTERACTIONS *
1347              **************************/
1348
1349             /* REACTION-FIELD ELECTROSTATICS */
1350             felec            = _mm_mul_ps(qq00,_mm_msub_ps(rinv00,rinvsq00,krf2));
1351
1352             fscal            = felec;
1353
1354             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1355
1356              /* Update vectorial force */
1357             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1358             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1359             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1360
1361             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1362             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1363             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1364
1365             /**************************
1366              * CALCULATE INTERACTIONS *
1367              **************************/
1368
1369             /* REACTION-FIELD ELECTROSTATICS */
1370             felec            = _mm_mul_ps(qq01,_mm_msub_ps(rinv01,rinvsq01,krf2));
1371
1372             fscal            = felec;
1373
1374             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1375
1376              /* Update vectorial force */
1377             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1378             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1379             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1380
1381             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1382             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1383             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1384
1385             /**************************
1386              * CALCULATE INTERACTIONS *
1387              **************************/
1388
1389             /* REACTION-FIELD ELECTROSTATICS */
1390             felec            = _mm_mul_ps(qq02,_mm_msub_ps(rinv02,rinvsq02,krf2));
1391
1392             fscal            = felec;
1393
1394             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1395
1396              /* Update vectorial force */
1397             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1398             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1399             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1400
1401             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1402             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1403             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1404
1405             /**************************
1406              * CALCULATE INTERACTIONS *
1407              **************************/
1408
1409             /* REACTION-FIELD ELECTROSTATICS */
1410             felec            = _mm_mul_ps(qq10,_mm_msub_ps(rinv10,rinvsq10,krf2));
1411
1412             fscal            = felec;
1413
1414             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1415
1416              /* Update vectorial force */
1417             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1418             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1419             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1420
1421             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1422             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1423             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1424
1425             /**************************
1426              * CALCULATE INTERACTIONS *
1427              **************************/
1428
1429             /* REACTION-FIELD ELECTROSTATICS */
1430             felec            = _mm_mul_ps(qq11,_mm_msub_ps(rinv11,rinvsq11,krf2));
1431
1432             fscal            = felec;
1433
1434             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1435
1436              /* Update vectorial force */
1437             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1438             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1439             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1440
1441             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1442             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1443             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1444
1445             /**************************
1446              * CALCULATE INTERACTIONS *
1447              **************************/
1448
1449             /* REACTION-FIELD ELECTROSTATICS */
1450             felec            = _mm_mul_ps(qq12,_mm_msub_ps(rinv12,rinvsq12,krf2));
1451
1452             fscal            = felec;
1453
1454             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1455
1456              /* Update vectorial force */
1457             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1458             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1459             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1460
1461             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1462             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1463             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1464
1465             /**************************
1466              * CALCULATE INTERACTIONS *
1467              **************************/
1468
1469             /* REACTION-FIELD ELECTROSTATICS */
1470             felec            = _mm_mul_ps(qq20,_mm_msub_ps(rinv20,rinvsq20,krf2));
1471
1472             fscal            = felec;
1473
1474             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1475
1476              /* Update vectorial force */
1477             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1478             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1479             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1480
1481             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1482             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1483             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1484
1485             /**************************
1486              * CALCULATE INTERACTIONS *
1487              **************************/
1488
1489             /* REACTION-FIELD ELECTROSTATICS */
1490             felec            = _mm_mul_ps(qq21,_mm_msub_ps(rinv21,rinvsq21,krf2));
1491
1492             fscal            = felec;
1493
1494             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1495
1496              /* Update vectorial force */
1497             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1498             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1499             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1500
1501             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1502             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1503             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1504
1505             /**************************
1506              * CALCULATE INTERACTIONS *
1507              **************************/
1508
1509             /* REACTION-FIELD ELECTROSTATICS */
1510             felec            = _mm_mul_ps(qq22,_mm_msub_ps(rinv22,rinvsq22,krf2));
1511
1512             fscal            = felec;
1513
1514             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1515
1516              /* Update vectorial force */
1517             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1518             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1519             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1520
1521             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1522             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1523             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1524
1525             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1526             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1527             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1528             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1529
1530             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1531                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1532
1533             /* Inner loop uses 270 flops */
1534         }
1535
1536         /* End of innermost loop */
1537
1538         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1539                                               f+i_coord_offset,fshift+i_shift_offset);
1540
1541         /* Increment number of inner iterations */
1542         inneriter                  += j_index_end - j_index_start;
1543
1544         /* Outer loop uses 18 flops */
1545     }
1546
1547     /* Increment number of outer iterations */
1548     outeriter        += nri;
1549
1550     /* Update outer/inner flops */
1551
1552     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*270);
1553 }