f8cc427786cffbe9caa351d0b2ddf8eac70e97d8
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_avx_128_fma_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
38  * Electrostatics interaction: Coulomb
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     int              nvdwtype;
93     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
94     int              *vdwtype;
95     real             *vdwparam;
96     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
97     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
98     __m128i          vfitab;
99     __m128i          ifour       = _mm_set1_epi32(4);
100     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
101     real             *vftab;
102     __m128           dummy_mask,cutoff_mask;
103     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
104     __m128           one     = _mm_set1_ps(1.0);
105     __m128           two     = _mm_set1_ps(2.0);
106     x                = xx[0];
107     f                = ff[0];
108
109     nri              = nlist->nri;
110     iinr             = nlist->iinr;
111     jindex           = nlist->jindex;
112     jjnr             = nlist->jjnr;
113     shiftidx         = nlist->shift;
114     gid              = nlist->gid;
115     shiftvec         = fr->shift_vec[0];
116     fshift           = fr->fshift[0];
117     facel            = _mm_set1_ps(fr->epsfac);
118     charge           = mdatoms->chargeA;
119     nvdwtype         = fr->ntype;
120     vdwparam         = fr->nbfp;
121     vdwtype          = mdatoms->typeA;
122
123     vftab            = kernel_data->table_vdw->data;
124     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
125
126     /* Setup water-specific parameters */
127     inr              = nlist->iinr[0];
128     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
129     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
130     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
131     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
132
133     jq0              = _mm_set1_ps(charge[inr+0]);
134     jq1              = _mm_set1_ps(charge[inr+1]);
135     jq2              = _mm_set1_ps(charge[inr+2]);
136     vdwjidx0A        = 2*vdwtype[inr+0];
137     qq00             = _mm_mul_ps(iq0,jq0);
138     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
139     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
140     qq01             = _mm_mul_ps(iq0,jq1);
141     qq02             = _mm_mul_ps(iq0,jq2);
142     qq10             = _mm_mul_ps(iq1,jq0);
143     qq11             = _mm_mul_ps(iq1,jq1);
144     qq12             = _mm_mul_ps(iq1,jq2);
145     qq20             = _mm_mul_ps(iq2,jq0);
146     qq21             = _mm_mul_ps(iq2,jq1);
147     qq22             = _mm_mul_ps(iq2,jq2);
148
149     /* Avoid stupid compiler warnings */
150     jnrA = jnrB = jnrC = jnrD = 0;
151     j_coord_offsetA = 0;
152     j_coord_offsetB = 0;
153     j_coord_offsetC = 0;
154     j_coord_offsetD = 0;
155
156     outeriter        = 0;
157     inneriter        = 0;
158
159     for(iidx=0;iidx<4*DIM;iidx++)
160     {
161         scratch[iidx] = 0.0;
162     }
163
164     /* Start outer loop over neighborlists */
165     for(iidx=0; iidx<nri; iidx++)
166     {
167         /* Load shift vector for this list */
168         i_shift_offset   = DIM*shiftidx[iidx];
169
170         /* Load limits for loop over neighbors */
171         j_index_start    = jindex[iidx];
172         j_index_end      = jindex[iidx+1];
173
174         /* Get outer coordinate index */
175         inr              = iinr[iidx];
176         i_coord_offset   = DIM*inr;
177
178         /* Load i particle coords and add shift vector */
179         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
180                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
181
182         fix0             = _mm_setzero_ps();
183         fiy0             = _mm_setzero_ps();
184         fiz0             = _mm_setzero_ps();
185         fix1             = _mm_setzero_ps();
186         fiy1             = _mm_setzero_ps();
187         fiz1             = _mm_setzero_ps();
188         fix2             = _mm_setzero_ps();
189         fiy2             = _mm_setzero_ps();
190         fiz2             = _mm_setzero_ps();
191
192         /* Reset potential sums */
193         velecsum         = _mm_setzero_ps();
194         vvdwsum          = _mm_setzero_ps();
195
196         /* Start inner kernel loop */
197         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
198         {
199
200             /* Get j neighbor index, and coordinate index */
201             jnrA             = jjnr[jidx];
202             jnrB             = jjnr[jidx+1];
203             jnrC             = jjnr[jidx+2];
204             jnrD             = jjnr[jidx+3];
205             j_coord_offsetA  = DIM*jnrA;
206             j_coord_offsetB  = DIM*jnrB;
207             j_coord_offsetC  = DIM*jnrC;
208             j_coord_offsetD  = DIM*jnrD;
209
210             /* load j atom coordinates */
211             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
212                                               x+j_coord_offsetC,x+j_coord_offsetD,
213                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
214
215             /* Calculate displacement vector */
216             dx00             = _mm_sub_ps(ix0,jx0);
217             dy00             = _mm_sub_ps(iy0,jy0);
218             dz00             = _mm_sub_ps(iz0,jz0);
219             dx01             = _mm_sub_ps(ix0,jx1);
220             dy01             = _mm_sub_ps(iy0,jy1);
221             dz01             = _mm_sub_ps(iz0,jz1);
222             dx02             = _mm_sub_ps(ix0,jx2);
223             dy02             = _mm_sub_ps(iy0,jy2);
224             dz02             = _mm_sub_ps(iz0,jz2);
225             dx10             = _mm_sub_ps(ix1,jx0);
226             dy10             = _mm_sub_ps(iy1,jy0);
227             dz10             = _mm_sub_ps(iz1,jz0);
228             dx11             = _mm_sub_ps(ix1,jx1);
229             dy11             = _mm_sub_ps(iy1,jy1);
230             dz11             = _mm_sub_ps(iz1,jz1);
231             dx12             = _mm_sub_ps(ix1,jx2);
232             dy12             = _mm_sub_ps(iy1,jy2);
233             dz12             = _mm_sub_ps(iz1,jz2);
234             dx20             = _mm_sub_ps(ix2,jx0);
235             dy20             = _mm_sub_ps(iy2,jy0);
236             dz20             = _mm_sub_ps(iz2,jz0);
237             dx21             = _mm_sub_ps(ix2,jx1);
238             dy21             = _mm_sub_ps(iy2,jy1);
239             dz21             = _mm_sub_ps(iz2,jz1);
240             dx22             = _mm_sub_ps(ix2,jx2);
241             dy22             = _mm_sub_ps(iy2,jy2);
242             dz22             = _mm_sub_ps(iz2,jz2);
243
244             /* Calculate squared distance and things based on it */
245             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
246             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
247             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
248             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
249             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
250             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
251             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
252             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
253             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
254
255             rinv00           = gmx_mm_invsqrt_ps(rsq00);
256             rinv01           = gmx_mm_invsqrt_ps(rsq01);
257             rinv02           = gmx_mm_invsqrt_ps(rsq02);
258             rinv10           = gmx_mm_invsqrt_ps(rsq10);
259             rinv11           = gmx_mm_invsqrt_ps(rsq11);
260             rinv12           = gmx_mm_invsqrt_ps(rsq12);
261             rinv20           = gmx_mm_invsqrt_ps(rsq20);
262             rinv21           = gmx_mm_invsqrt_ps(rsq21);
263             rinv22           = gmx_mm_invsqrt_ps(rsq22);
264
265             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
266             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
267             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
268             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
269             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
270             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
271             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
272             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
273             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
274
275             fjx0             = _mm_setzero_ps();
276             fjy0             = _mm_setzero_ps();
277             fjz0             = _mm_setzero_ps();
278             fjx1             = _mm_setzero_ps();
279             fjy1             = _mm_setzero_ps();
280             fjz1             = _mm_setzero_ps();
281             fjx2             = _mm_setzero_ps();
282             fjy2             = _mm_setzero_ps();
283             fjz2             = _mm_setzero_ps();
284
285             /**************************
286              * CALCULATE INTERACTIONS *
287              **************************/
288
289             r00              = _mm_mul_ps(rsq00,rinv00);
290
291             /* Calculate table index by multiplying r with table scale and truncate to integer */
292             rt               = _mm_mul_ps(r00,vftabscale);
293             vfitab           = _mm_cvttps_epi32(rt);
294 #ifdef __XOP__
295             vfeps            = _mm_frcz_ps(rt);
296 #else
297             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
298 #endif
299             twovfeps         = _mm_add_ps(vfeps,vfeps);
300             vfitab           = _mm_slli_epi32(vfitab,3);
301
302             /* COULOMB ELECTROSTATICS */
303             velec            = _mm_mul_ps(qq00,rinv00);
304             felec            = _mm_mul_ps(velec,rinvsq00);
305
306             /* CUBIC SPLINE TABLE DISPERSION */
307             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
308             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
309             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
310             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
311             _MM_TRANSPOSE4_PS(Y,F,G,H);
312             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
313             VV               = _mm_macc_ps(vfeps,Fp,Y);
314             vvdw6            = _mm_mul_ps(c6_00,VV);
315             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
316             fvdw6            = _mm_mul_ps(c6_00,FF);
317
318             /* CUBIC SPLINE TABLE REPULSION */
319             vfitab           = _mm_add_epi32(vfitab,ifour);
320             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
321             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
322             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
323             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
324             _MM_TRANSPOSE4_PS(Y,F,G,H);
325             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
326             VV               = _mm_macc_ps(vfeps,Fp,Y);
327             vvdw12           = _mm_mul_ps(c12_00,VV);
328             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
329             fvdw12           = _mm_mul_ps(c12_00,FF);
330             vvdw             = _mm_add_ps(vvdw12,vvdw6);
331             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
332
333             /* Update potential sum for this i atom from the interaction with this j atom. */
334             velecsum         = _mm_add_ps(velecsum,velec);
335             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
336
337             fscal            = _mm_add_ps(felec,fvdw);
338
339              /* Update vectorial force */
340             fix0             = _mm_macc_ps(dx00,fscal,fix0);
341             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
342             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
343
344             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
345             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
346             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
347
348             /**************************
349              * CALCULATE INTERACTIONS *
350              **************************/
351
352             /* COULOMB ELECTROSTATICS */
353             velec            = _mm_mul_ps(qq01,rinv01);
354             felec            = _mm_mul_ps(velec,rinvsq01);
355
356             /* Update potential sum for this i atom from the interaction with this j atom. */
357             velecsum         = _mm_add_ps(velecsum,velec);
358
359             fscal            = felec;
360
361              /* Update vectorial force */
362             fix0             = _mm_macc_ps(dx01,fscal,fix0);
363             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
364             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
365
366             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
367             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
368             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
369
370             /**************************
371              * CALCULATE INTERACTIONS *
372              **************************/
373
374             /* COULOMB ELECTROSTATICS */
375             velec            = _mm_mul_ps(qq02,rinv02);
376             felec            = _mm_mul_ps(velec,rinvsq02);
377
378             /* Update potential sum for this i atom from the interaction with this j atom. */
379             velecsum         = _mm_add_ps(velecsum,velec);
380
381             fscal            = felec;
382
383              /* Update vectorial force */
384             fix0             = _mm_macc_ps(dx02,fscal,fix0);
385             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
386             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
387
388             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
389             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
390             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
391
392             /**************************
393              * CALCULATE INTERACTIONS *
394              **************************/
395
396             /* COULOMB ELECTROSTATICS */
397             velec            = _mm_mul_ps(qq10,rinv10);
398             felec            = _mm_mul_ps(velec,rinvsq10);
399
400             /* Update potential sum for this i atom from the interaction with this j atom. */
401             velecsum         = _mm_add_ps(velecsum,velec);
402
403             fscal            = felec;
404
405              /* Update vectorial force */
406             fix1             = _mm_macc_ps(dx10,fscal,fix1);
407             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
408             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
409
410             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
411             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
412             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
413
414             /**************************
415              * CALCULATE INTERACTIONS *
416              **************************/
417
418             /* COULOMB ELECTROSTATICS */
419             velec            = _mm_mul_ps(qq11,rinv11);
420             felec            = _mm_mul_ps(velec,rinvsq11);
421
422             /* Update potential sum for this i atom from the interaction with this j atom. */
423             velecsum         = _mm_add_ps(velecsum,velec);
424
425             fscal            = felec;
426
427              /* Update vectorial force */
428             fix1             = _mm_macc_ps(dx11,fscal,fix1);
429             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
430             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
431
432             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
433             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
434             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
435
436             /**************************
437              * CALCULATE INTERACTIONS *
438              **************************/
439
440             /* COULOMB ELECTROSTATICS */
441             velec            = _mm_mul_ps(qq12,rinv12);
442             felec            = _mm_mul_ps(velec,rinvsq12);
443
444             /* Update potential sum for this i atom from the interaction with this j atom. */
445             velecsum         = _mm_add_ps(velecsum,velec);
446
447             fscal            = felec;
448
449              /* Update vectorial force */
450             fix1             = _mm_macc_ps(dx12,fscal,fix1);
451             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
452             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
453
454             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
455             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
456             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
457
458             /**************************
459              * CALCULATE INTERACTIONS *
460              **************************/
461
462             /* COULOMB ELECTROSTATICS */
463             velec            = _mm_mul_ps(qq20,rinv20);
464             felec            = _mm_mul_ps(velec,rinvsq20);
465
466             /* Update potential sum for this i atom from the interaction with this j atom. */
467             velecsum         = _mm_add_ps(velecsum,velec);
468
469             fscal            = felec;
470
471              /* Update vectorial force */
472             fix2             = _mm_macc_ps(dx20,fscal,fix2);
473             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
474             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
475
476             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
477             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
478             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
479
480             /**************************
481              * CALCULATE INTERACTIONS *
482              **************************/
483
484             /* COULOMB ELECTROSTATICS */
485             velec            = _mm_mul_ps(qq21,rinv21);
486             felec            = _mm_mul_ps(velec,rinvsq21);
487
488             /* Update potential sum for this i atom from the interaction with this j atom. */
489             velecsum         = _mm_add_ps(velecsum,velec);
490
491             fscal            = felec;
492
493              /* Update vectorial force */
494             fix2             = _mm_macc_ps(dx21,fscal,fix2);
495             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
496             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
497
498             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
499             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
500             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
501
502             /**************************
503              * CALCULATE INTERACTIONS *
504              **************************/
505
506             /* COULOMB ELECTROSTATICS */
507             velec            = _mm_mul_ps(qq22,rinv22);
508             felec            = _mm_mul_ps(velec,rinvsq22);
509
510             /* Update potential sum for this i atom from the interaction with this j atom. */
511             velecsum         = _mm_add_ps(velecsum,velec);
512
513             fscal            = felec;
514
515              /* Update vectorial force */
516             fix2             = _mm_macc_ps(dx22,fscal,fix2);
517             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
518             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
519
520             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
521             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
522             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
523
524             fjptrA             = f+j_coord_offsetA;
525             fjptrB             = f+j_coord_offsetB;
526             fjptrC             = f+j_coord_offsetC;
527             fjptrD             = f+j_coord_offsetD;
528
529             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
530                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
531
532             /* Inner loop uses 314 flops */
533         }
534
535         if(jidx<j_index_end)
536         {
537
538             /* Get j neighbor index, and coordinate index */
539             jnrlistA         = jjnr[jidx];
540             jnrlistB         = jjnr[jidx+1];
541             jnrlistC         = jjnr[jidx+2];
542             jnrlistD         = jjnr[jidx+3];
543             /* Sign of each element will be negative for non-real atoms.
544              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
545              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
546              */
547             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
548             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
549             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
550             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
551             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
552             j_coord_offsetA  = DIM*jnrA;
553             j_coord_offsetB  = DIM*jnrB;
554             j_coord_offsetC  = DIM*jnrC;
555             j_coord_offsetD  = DIM*jnrD;
556
557             /* load j atom coordinates */
558             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
559                                               x+j_coord_offsetC,x+j_coord_offsetD,
560                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
561
562             /* Calculate displacement vector */
563             dx00             = _mm_sub_ps(ix0,jx0);
564             dy00             = _mm_sub_ps(iy0,jy0);
565             dz00             = _mm_sub_ps(iz0,jz0);
566             dx01             = _mm_sub_ps(ix0,jx1);
567             dy01             = _mm_sub_ps(iy0,jy1);
568             dz01             = _mm_sub_ps(iz0,jz1);
569             dx02             = _mm_sub_ps(ix0,jx2);
570             dy02             = _mm_sub_ps(iy0,jy2);
571             dz02             = _mm_sub_ps(iz0,jz2);
572             dx10             = _mm_sub_ps(ix1,jx0);
573             dy10             = _mm_sub_ps(iy1,jy0);
574             dz10             = _mm_sub_ps(iz1,jz0);
575             dx11             = _mm_sub_ps(ix1,jx1);
576             dy11             = _mm_sub_ps(iy1,jy1);
577             dz11             = _mm_sub_ps(iz1,jz1);
578             dx12             = _mm_sub_ps(ix1,jx2);
579             dy12             = _mm_sub_ps(iy1,jy2);
580             dz12             = _mm_sub_ps(iz1,jz2);
581             dx20             = _mm_sub_ps(ix2,jx0);
582             dy20             = _mm_sub_ps(iy2,jy0);
583             dz20             = _mm_sub_ps(iz2,jz0);
584             dx21             = _mm_sub_ps(ix2,jx1);
585             dy21             = _mm_sub_ps(iy2,jy1);
586             dz21             = _mm_sub_ps(iz2,jz1);
587             dx22             = _mm_sub_ps(ix2,jx2);
588             dy22             = _mm_sub_ps(iy2,jy2);
589             dz22             = _mm_sub_ps(iz2,jz2);
590
591             /* Calculate squared distance and things based on it */
592             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
593             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
594             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
595             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
596             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
597             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
598             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
599             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
600             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
601
602             rinv00           = gmx_mm_invsqrt_ps(rsq00);
603             rinv01           = gmx_mm_invsqrt_ps(rsq01);
604             rinv02           = gmx_mm_invsqrt_ps(rsq02);
605             rinv10           = gmx_mm_invsqrt_ps(rsq10);
606             rinv11           = gmx_mm_invsqrt_ps(rsq11);
607             rinv12           = gmx_mm_invsqrt_ps(rsq12);
608             rinv20           = gmx_mm_invsqrt_ps(rsq20);
609             rinv21           = gmx_mm_invsqrt_ps(rsq21);
610             rinv22           = gmx_mm_invsqrt_ps(rsq22);
611
612             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
613             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
614             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
615             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
616             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
617             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
618             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
619             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
620             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
621
622             fjx0             = _mm_setzero_ps();
623             fjy0             = _mm_setzero_ps();
624             fjz0             = _mm_setzero_ps();
625             fjx1             = _mm_setzero_ps();
626             fjy1             = _mm_setzero_ps();
627             fjz1             = _mm_setzero_ps();
628             fjx2             = _mm_setzero_ps();
629             fjy2             = _mm_setzero_ps();
630             fjz2             = _mm_setzero_ps();
631
632             /**************************
633              * CALCULATE INTERACTIONS *
634              **************************/
635
636             r00              = _mm_mul_ps(rsq00,rinv00);
637             r00              = _mm_andnot_ps(dummy_mask,r00);
638
639             /* Calculate table index by multiplying r with table scale and truncate to integer */
640             rt               = _mm_mul_ps(r00,vftabscale);
641             vfitab           = _mm_cvttps_epi32(rt);
642 #ifdef __XOP__
643             vfeps            = _mm_frcz_ps(rt);
644 #else
645             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
646 #endif
647             twovfeps         = _mm_add_ps(vfeps,vfeps);
648             vfitab           = _mm_slli_epi32(vfitab,3);
649
650             /* COULOMB ELECTROSTATICS */
651             velec            = _mm_mul_ps(qq00,rinv00);
652             felec            = _mm_mul_ps(velec,rinvsq00);
653
654             /* CUBIC SPLINE TABLE DISPERSION */
655             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
656             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
657             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
658             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
659             _MM_TRANSPOSE4_PS(Y,F,G,H);
660             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
661             VV               = _mm_macc_ps(vfeps,Fp,Y);
662             vvdw6            = _mm_mul_ps(c6_00,VV);
663             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
664             fvdw6            = _mm_mul_ps(c6_00,FF);
665
666             /* CUBIC SPLINE TABLE REPULSION */
667             vfitab           = _mm_add_epi32(vfitab,ifour);
668             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
669             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
670             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
671             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
672             _MM_TRANSPOSE4_PS(Y,F,G,H);
673             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
674             VV               = _mm_macc_ps(vfeps,Fp,Y);
675             vvdw12           = _mm_mul_ps(c12_00,VV);
676             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
677             fvdw12           = _mm_mul_ps(c12_00,FF);
678             vvdw             = _mm_add_ps(vvdw12,vvdw6);
679             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
680
681             /* Update potential sum for this i atom from the interaction with this j atom. */
682             velec            = _mm_andnot_ps(dummy_mask,velec);
683             velecsum         = _mm_add_ps(velecsum,velec);
684             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
685             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
686
687             fscal            = _mm_add_ps(felec,fvdw);
688
689             fscal            = _mm_andnot_ps(dummy_mask,fscal);
690
691              /* Update vectorial force */
692             fix0             = _mm_macc_ps(dx00,fscal,fix0);
693             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
694             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
695
696             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
697             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
698             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
699
700             /**************************
701              * CALCULATE INTERACTIONS *
702              **************************/
703
704             /* COULOMB ELECTROSTATICS */
705             velec            = _mm_mul_ps(qq01,rinv01);
706             felec            = _mm_mul_ps(velec,rinvsq01);
707
708             /* Update potential sum for this i atom from the interaction with this j atom. */
709             velec            = _mm_andnot_ps(dummy_mask,velec);
710             velecsum         = _mm_add_ps(velecsum,velec);
711
712             fscal            = felec;
713
714             fscal            = _mm_andnot_ps(dummy_mask,fscal);
715
716              /* Update vectorial force */
717             fix0             = _mm_macc_ps(dx01,fscal,fix0);
718             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
719             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
720
721             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
722             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
723             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
724
725             /**************************
726              * CALCULATE INTERACTIONS *
727              **************************/
728
729             /* COULOMB ELECTROSTATICS */
730             velec            = _mm_mul_ps(qq02,rinv02);
731             felec            = _mm_mul_ps(velec,rinvsq02);
732
733             /* Update potential sum for this i atom from the interaction with this j atom. */
734             velec            = _mm_andnot_ps(dummy_mask,velec);
735             velecsum         = _mm_add_ps(velecsum,velec);
736
737             fscal            = felec;
738
739             fscal            = _mm_andnot_ps(dummy_mask,fscal);
740
741              /* Update vectorial force */
742             fix0             = _mm_macc_ps(dx02,fscal,fix0);
743             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
744             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
745
746             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
747             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
748             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
749
750             /**************************
751              * CALCULATE INTERACTIONS *
752              **************************/
753
754             /* COULOMB ELECTROSTATICS */
755             velec            = _mm_mul_ps(qq10,rinv10);
756             felec            = _mm_mul_ps(velec,rinvsq10);
757
758             /* Update potential sum for this i atom from the interaction with this j atom. */
759             velec            = _mm_andnot_ps(dummy_mask,velec);
760             velecsum         = _mm_add_ps(velecsum,velec);
761
762             fscal            = felec;
763
764             fscal            = _mm_andnot_ps(dummy_mask,fscal);
765
766              /* Update vectorial force */
767             fix1             = _mm_macc_ps(dx10,fscal,fix1);
768             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
769             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
770
771             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
772             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
773             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
774
775             /**************************
776              * CALCULATE INTERACTIONS *
777              **************************/
778
779             /* COULOMB ELECTROSTATICS */
780             velec            = _mm_mul_ps(qq11,rinv11);
781             felec            = _mm_mul_ps(velec,rinvsq11);
782
783             /* Update potential sum for this i atom from the interaction with this j atom. */
784             velec            = _mm_andnot_ps(dummy_mask,velec);
785             velecsum         = _mm_add_ps(velecsum,velec);
786
787             fscal            = felec;
788
789             fscal            = _mm_andnot_ps(dummy_mask,fscal);
790
791              /* Update vectorial force */
792             fix1             = _mm_macc_ps(dx11,fscal,fix1);
793             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
794             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
795
796             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
797             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
798             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
799
800             /**************************
801              * CALCULATE INTERACTIONS *
802              **************************/
803
804             /* COULOMB ELECTROSTATICS */
805             velec            = _mm_mul_ps(qq12,rinv12);
806             felec            = _mm_mul_ps(velec,rinvsq12);
807
808             /* Update potential sum for this i atom from the interaction with this j atom. */
809             velec            = _mm_andnot_ps(dummy_mask,velec);
810             velecsum         = _mm_add_ps(velecsum,velec);
811
812             fscal            = felec;
813
814             fscal            = _mm_andnot_ps(dummy_mask,fscal);
815
816              /* Update vectorial force */
817             fix1             = _mm_macc_ps(dx12,fscal,fix1);
818             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
819             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
820
821             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
822             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
823             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
824
825             /**************************
826              * CALCULATE INTERACTIONS *
827              **************************/
828
829             /* COULOMB ELECTROSTATICS */
830             velec            = _mm_mul_ps(qq20,rinv20);
831             felec            = _mm_mul_ps(velec,rinvsq20);
832
833             /* Update potential sum for this i atom from the interaction with this j atom. */
834             velec            = _mm_andnot_ps(dummy_mask,velec);
835             velecsum         = _mm_add_ps(velecsum,velec);
836
837             fscal            = felec;
838
839             fscal            = _mm_andnot_ps(dummy_mask,fscal);
840
841              /* Update vectorial force */
842             fix2             = _mm_macc_ps(dx20,fscal,fix2);
843             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
844             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
845
846             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
847             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
848             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
849
850             /**************************
851              * CALCULATE INTERACTIONS *
852              **************************/
853
854             /* COULOMB ELECTROSTATICS */
855             velec            = _mm_mul_ps(qq21,rinv21);
856             felec            = _mm_mul_ps(velec,rinvsq21);
857
858             /* Update potential sum for this i atom from the interaction with this j atom. */
859             velec            = _mm_andnot_ps(dummy_mask,velec);
860             velecsum         = _mm_add_ps(velecsum,velec);
861
862             fscal            = felec;
863
864             fscal            = _mm_andnot_ps(dummy_mask,fscal);
865
866              /* Update vectorial force */
867             fix2             = _mm_macc_ps(dx21,fscal,fix2);
868             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
869             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
870
871             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
872             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
873             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
874
875             /**************************
876              * CALCULATE INTERACTIONS *
877              **************************/
878
879             /* COULOMB ELECTROSTATICS */
880             velec            = _mm_mul_ps(qq22,rinv22);
881             felec            = _mm_mul_ps(velec,rinvsq22);
882
883             /* Update potential sum for this i atom from the interaction with this j atom. */
884             velec            = _mm_andnot_ps(dummy_mask,velec);
885             velecsum         = _mm_add_ps(velecsum,velec);
886
887             fscal            = felec;
888
889             fscal            = _mm_andnot_ps(dummy_mask,fscal);
890
891              /* Update vectorial force */
892             fix2             = _mm_macc_ps(dx22,fscal,fix2);
893             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
894             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
895
896             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
897             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
898             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
899
900             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
901             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
902             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
903             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
904
905             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
906                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
907
908             /* Inner loop uses 315 flops */
909         }
910
911         /* End of innermost loop */
912
913         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
914                                               f+i_coord_offset,fshift+i_shift_offset);
915
916         ggid                        = gid[iidx];
917         /* Update potential energies */
918         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
919         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
920
921         /* Increment number of inner iterations */
922         inneriter                  += j_index_end - j_index_start;
923
924         /* Outer loop uses 20 flops */
925     }
926
927     /* Increment number of outer iterations */
928     outeriter        += nri;
929
930     /* Update outer/inner flops */
931
932     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*315);
933 }
934 /*
935  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_single
936  * Electrostatics interaction: Coulomb
937  * VdW interaction:            CubicSplineTable
938  * Geometry:                   Water3-Water3
939  * Calculate force/pot:        Force
940  */
941 void
942 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_single
943                     (t_nblist * gmx_restrict                nlist,
944                      rvec * gmx_restrict                    xx,
945                      rvec * gmx_restrict                    ff,
946                      t_forcerec * gmx_restrict              fr,
947                      t_mdatoms * gmx_restrict               mdatoms,
948                      nb_kernel_data_t * gmx_restrict        kernel_data,
949                      t_nrnb * gmx_restrict                  nrnb)
950 {
951     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
952      * just 0 for non-waters.
953      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
954      * jnr indices corresponding to data put in the four positions in the SIMD register.
955      */
956     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
957     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
958     int              jnrA,jnrB,jnrC,jnrD;
959     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
960     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
961     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
962     real             rcutoff_scalar;
963     real             *shiftvec,*fshift,*x,*f;
964     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
965     real             scratch[4*DIM];
966     __m128           fscal,rcutoff,rcutoff2,jidxall;
967     int              vdwioffset0;
968     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
969     int              vdwioffset1;
970     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
971     int              vdwioffset2;
972     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
973     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
974     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
975     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
976     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
977     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
978     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
979     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
980     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
981     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
982     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
983     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
984     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
985     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
986     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
987     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
988     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
989     real             *charge;
990     int              nvdwtype;
991     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
992     int              *vdwtype;
993     real             *vdwparam;
994     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
995     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
996     __m128i          vfitab;
997     __m128i          ifour       = _mm_set1_epi32(4);
998     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
999     real             *vftab;
1000     __m128           dummy_mask,cutoff_mask;
1001     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1002     __m128           one     = _mm_set1_ps(1.0);
1003     __m128           two     = _mm_set1_ps(2.0);
1004     x                = xx[0];
1005     f                = ff[0];
1006
1007     nri              = nlist->nri;
1008     iinr             = nlist->iinr;
1009     jindex           = nlist->jindex;
1010     jjnr             = nlist->jjnr;
1011     shiftidx         = nlist->shift;
1012     gid              = nlist->gid;
1013     shiftvec         = fr->shift_vec[0];
1014     fshift           = fr->fshift[0];
1015     facel            = _mm_set1_ps(fr->epsfac);
1016     charge           = mdatoms->chargeA;
1017     nvdwtype         = fr->ntype;
1018     vdwparam         = fr->nbfp;
1019     vdwtype          = mdatoms->typeA;
1020
1021     vftab            = kernel_data->table_vdw->data;
1022     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1023
1024     /* Setup water-specific parameters */
1025     inr              = nlist->iinr[0];
1026     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1027     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1028     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1029     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1030
1031     jq0              = _mm_set1_ps(charge[inr+0]);
1032     jq1              = _mm_set1_ps(charge[inr+1]);
1033     jq2              = _mm_set1_ps(charge[inr+2]);
1034     vdwjidx0A        = 2*vdwtype[inr+0];
1035     qq00             = _mm_mul_ps(iq0,jq0);
1036     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1037     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1038     qq01             = _mm_mul_ps(iq0,jq1);
1039     qq02             = _mm_mul_ps(iq0,jq2);
1040     qq10             = _mm_mul_ps(iq1,jq0);
1041     qq11             = _mm_mul_ps(iq1,jq1);
1042     qq12             = _mm_mul_ps(iq1,jq2);
1043     qq20             = _mm_mul_ps(iq2,jq0);
1044     qq21             = _mm_mul_ps(iq2,jq1);
1045     qq22             = _mm_mul_ps(iq2,jq2);
1046
1047     /* Avoid stupid compiler warnings */
1048     jnrA = jnrB = jnrC = jnrD = 0;
1049     j_coord_offsetA = 0;
1050     j_coord_offsetB = 0;
1051     j_coord_offsetC = 0;
1052     j_coord_offsetD = 0;
1053
1054     outeriter        = 0;
1055     inneriter        = 0;
1056
1057     for(iidx=0;iidx<4*DIM;iidx++)
1058     {
1059         scratch[iidx] = 0.0;
1060     }
1061
1062     /* Start outer loop over neighborlists */
1063     for(iidx=0; iidx<nri; iidx++)
1064     {
1065         /* Load shift vector for this list */
1066         i_shift_offset   = DIM*shiftidx[iidx];
1067
1068         /* Load limits for loop over neighbors */
1069         j_index_start    = jindex[iidx];
1070         j_index_end      = jindex[iidx+1];
1071
1072         /* Get outer coordinate index */
1073         inr              = iinr[iidx];
1074         i_coord_offset   = DIM*inr;
1075
1076         /* Load i particle coords and add shift vector */
1077         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1078                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1079
1080         fix0             = _mm_setzero_ps();
1081         fiy0             = _mm_setzero_ps();
1082         fiz0             = _mm_setzero_ps();
1083         fix1             = _mm_setzero_ps();
1084         fiy1             = _mm_setzero_ps();
1085         fiz1             = _mm_setzero_ps();
1086         fix2             = _mm_setzero_ps();
1087         fiy2             = _mm_setzero_ps();
1088         fiz2             = _mm_setzero_ps();
1089
1090         /* Start inner kernel loop */
1091         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1092         {
1093
1094             /* Get j neighbor index, and coordinate index */
1095             jnrA             = jjnr[jidx];
1096             jnrB             = jjnr[jidx+1];
1097             jnrC             = jjnr[jidx+2];
1098             jnrD             = jjnr[jidx+3];
1099             j_coord_offsetA  = DIM*jnrA;
1100             j_coord_offsetB  = DIM*jnrB;
1101             j_coord_offsetC  = DIM*jnrC;
1102             j_coord_offsetD  = DIM*jnrD;
1103
1104             /* load j atom coordinates */
1105             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1106                                               x+j_coord_offsetC,x+j_coord_offsetD,
1107                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1108
1109             /* Calculate displacement vector */
1110             dx00             = _mm_sub_ps(ix0,jx0);
1111             dy00             = _mm_sub_ps(iy0,jy0);
1112             dz00             = _mm_sub_ps(iz0,jz0);
1113             dx01             = _mm_sub_ps(ix0,jx1);
1114             dy01             = _mm_sub_ps(iy0,jy1);
1115             dz01             = _mm_sub_ps(iz0,jz1);
1116             dx02             = _mm_sub_ps(ix0,jx2);
1117             dy02             = _mm_sub_ps(iy0,jy2);
1118             dz02             = _mm_sub_ps(iz0,jz2);
1119             dx10             = _mm_sub_ps(ix1,jx0);
1120             dy10             = _mm_sub_ps(iy1,jy0);
1121             dz10             = _mm_sub_ps(iz1,jz0);
1122             dx11             = _mm_sub_ps(ix1,jx1);
1123             dy11             = _mm_sub_ps(iy1,jy1);
1124             dz11             = _mm_sub_ps(iz1,jz1);
1125             dx12             = _mm_sub_ps(ix1,jx2);
1126             dy12             = _mm_sub_ps(iy1,jy2);
1127             dz12             = _mm_sub_ps(iz1,jz2);
1128             dx20             = _mm_sub_ps(ix2,jx0);
1129             dy20             = _mm_sub_ps(iy2,jy0);
1130             dz20             = _mm_sub_ps(iz2,jz0);
1131             dx21             = _mm_sub_ps(ix2,jx1);
1132             dy21             = _mm_sub_ps(iy2,jy1);
1133             dz21             = _mm_sub_ps(iz2,jz1);
1134             dx22             = _mm_sub_ps(ix2,jx2);
1135             dy22             = _mm_sub_ps(iy2,jy2);
1136             dz22             = _mm_sub_ps(iz2,jz2);
1137
1138             /* Calculate squared distance and things based on it */
1139             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1140             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1141             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1142             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1143             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1144             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1145             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1146             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1147             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1148
1149             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1150             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1151             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1152             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1153             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1154             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1155             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1156             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1157             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1158
1159             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1160             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1161             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1162             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1163             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1164             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1165             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1166             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1167             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1168
1169             fjx0             = _mm_setzero_ps();
1170             fjy0             = _mm_setzero_ps();
1171             fjz0             = _mm_setzero_ps();
1172             fjx1             = _mm_setzero_ps();
1173             fjy1             = _mm_setzero_ps();
1174             fjz1             = _mm_setzero_ps();
1175             fjx2             = _mm_setzero_ps();
1176             fjy2             = _mm_setzero_ps();
1177             fjz2             = _mm_setzero_ps();
1178
1179             /**************************
1180              * CALCULATE INTERACTIONS *
1181              **************************/
1182
1183             r00              = _mm_mul_ps(rsq00,rinv00);
1184
1185             /* Calculate table index by multiplying r with table scale and truncate to integer */
1186             rt               = _mm_mul_ps(r00,vftabscale);
1187             vfitab           = _mm_cvttps_epi32(rt);
1188 #ifdef __XOP__
1189             vfeps            = _mm_frcz_ps(rt);
1190 #else
1191             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1192 #endif
1193             twovfeps         = _mm_add_ps(vfeps,vfeps);
1194             vfitab           = _mm_slli_epi32(vfitab,3);
1195
1196             /* COULOMB ELECTROSTATICS */
1197             velec            = _mm_mul_ps(qq00,rinv00);
1198             felec            = _mm_mul_ps(velec,rinvsq00);
1199
1200             /* CUBIC SPLINE TABLE DISPERSION */
1201             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1202             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1203             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1204             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1205             _MM_TRANSPOSE4_PS(Y,F,G,H);
1206             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1207             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1208             fvdw6            = _mm_mul_ps(c6_00,FF);
1209
1210             /* CUBIC SPLINE TABLE REPULSION */
1211             vfitab           = _mm_add_epi32(vfitab,ifour);
1212             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1213             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1214             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1215             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1216             _MM_TRANSPOSE4_PS(Y,F,G,H);
1217             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1218             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1219             fvdw12           = _mm_mul_ps(c12_00,FF);
1220             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1221
1222             fscal            = _mm_add_ps(felec,fvdw);
1223
1224              /* Update vectorial force */
1225             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1226             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1227             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1228
1229             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1230             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1231             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1232
1233             /**************************
1234              * CALCULATE INTERACTIONS *
1235              **************************/
1236
1237             /* COULOMB ELECTROSTATICS */
1238             velec            = _mm_mul_ps(qq01,rinv01);
1239             felec            = _mm_mul_ps(velec,rinvsq01);
1240
1241             fscal            = felec;
1242
1243              /* Update vectorial force */
1244             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1245             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1246             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1247
1248             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1249             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1250             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1251
1252             /**************************
1253              * CALCULATE INTERACTIONS *
1254              **************************/
1255
1256             /* COULOMB ELECTROSTATICS */
1257             velec            = _mm_mul_ps(qq02,rinv02);
1258             felec            = _mm_mul_ps(velec,rinvsq02);
1259
1260             fscal            = felec;
1261
1262              /* Update vectorial force */
1263             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1264             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1265             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1266
1267             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1268             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1269             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1270
1271             /**************************
1272              * CALCULATE INTERACTIONS *
1273              **************************/
1274
1275             /* COULOMB ELECTROSTATICS */
1276             velec            = _mm_mul_ps(qq10,rinv10);
1277             felec            = _mm_mul_ps(velec,rinvsq10);
1278
1279             fscal            = felec;
1280
1281              /* Update vectorial force */
1282             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1283             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1284             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1285
1286             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1287             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1288             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1289
1290             /**************************
1291              * CALCULATE INTERACTIONS *
1292              **************************/
1293
1294             /* COULOMB ELECTROSTATICS */
1295             velec            = _mm_mul_ps(qq11,rinv11);
1296             felec            = _mm_mul_ps(velec,rinvsq11);
1297
1298             fscal            = felec;
1299
1300              /* Update vectorial force */
1301             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1302             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1303             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1304
1305             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1306             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1307             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1308
1309             /**************************
1310              * CALCULATE INTERACTIONS *
1311              **************************/
1312
1313             /* COULOMB ELECTROSTATICS */
1314             velec            = _mm_mul_ps(qq12,rinv12);
1315             felec            = _mm_mul_ps(velec,rinvsq12);
1316
1317             fscal            = felec;
1318
1319              /* Update vectorial force */
1320             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1321             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1322             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1323
1324             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1325             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1326             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1327
1328             /**************************
1329              * CALCULATE INTERACTIONS *
1330              **************************/
1331
1332             /* COULOMB ELECTROSTATICS */
1333             velec            = _mm_mul_ps(qq20,rinv20);
1334             felec            = _mm_mul_ps(velec,rinvsq20);
1335
1336             fscal            = felec;
1337
1338              /* Update vectorial force */
1339             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1340             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1341             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1342
1343             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1344             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1345             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1346
1347             /**************************
1348              * CALCULATE INTERACTIONS *
1349              **************************/
1350
1351             /* COULOMB ELECTROSTATICS */
1352             velec            = _mm_mul_ps(qq21,rinv21);
1353             felec            = _mm_mul_ps(velec,rinvsq21);
1354
1355             fscal            = felec;
1356
1357              /* Update vectorial force */
1358             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1359             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1360             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1361
1362             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1363             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1364             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1365
1366             /**************************
1367              * CALCULATE INTERACTIONS *
1368              **************************/
1369
1370             /* COULOMB ELECTROSTATICS */
1371             velec            = _mm_mul_ps(qq22,rinv22);
1372             felec            = _mm_mul_ps(velec,rinvsq22);
1373
1374             fscal            = felec;
1375
1376              /* Update vectorial force */
1377             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1378             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1379             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1380
1381             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1382             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1383             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1384
1385             fjptrA             = f+j_coord_offsetA;
1386             fjptrB             = f+j_coord_offsetB;
1387             fjptrC             = f+j_coord_offsetC;
1388             fjptrD             = f+j_coord_offsetD;
1389
1390             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1391                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1392
1393             /* Inner loop uses 297 flops */
1394         }
1395
1396         if(jidx<j_index_end)
1397         {
1398
1399             /* Get j neighbor index, and coordinate index */
1400             jnrlistA         = jjnr[jidx];
1401             jnrlistB         = jjnr[jidx+1];
1402             jnrlistC         = jjnr[jidx+2];
1403             jnrlistD         = jjnr[jidx+3];
1404             /* Sign of each element will be negative for non-real atoms.
1405              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1406              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1407              */
1408             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1409             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1410             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1411             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1412             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1413             j_coord_offsetA  = DIM*jnrA;
1414             j_coord_offsetB  = DIM*jnrB;
1415             j_coord_offsetC  = DIM*jnrC;
1416             j_coord_offsetD  = DIM*jnrD;
1417
1418             /* load j atom coordinates */
1419             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1420                                               x+j_coord_offsetC,x+j_coord_offsetD,
1421                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1422
1423             /* Calculate displacement vector */
1424             dx00             = _mm_sub_ps(ix0,jx0);
1425             dy00             = _mm_sub_ps(iy0,jy0);
1426             dz00             = _mm_sub_ps(iz0,jz0);
1427             dx01             = _mm_sub_ps(ix0,jx1);
1428             dy01             = _mm_sub_ps(iy0,jy1);
1429             dz01             = _mm_sub_ps(iz0,jz1);
1430             dx02             = _mm_sub_ps(ix0,jx2);
1431             dy02             = _mm_sub_ps(iy0,jy2);
1432             dz02             = _mm_sub_ps(iz0,jz2);
1433             dx10             = _mm_sub_ps(ix1,jx0);
1434             dy10             = _mm_sub_ps(iy1,jy0);
1435             dz10             = _mm_sub_ps(iz1,jz0);
1436             dx11             = _mm_sub_ps(ix1,jx1);
1437             dy11             = _mm_sub_ps(iy1,jy1);
1438             dz11             = _mm_sub_ps(iz1,jz1);
1439             dx12             = _mm_sub_ps(ix1,jx2);
1440             dy12             = _mm_sub_ps(iy1,jy2);
1441             dz12             = _mm_sub_ps(iz1,jz2);
1442             dx20             = _mm_sub_ps(ix2,jx0);
1443             dy20             = _mm_sub_ps(iy2,jy0);
1444             dz20             = _mm_sub_ps(iz2,jz0);
1445             dx21             = _mm_sub_ps(ix2,jx1);
1446             dy21             = _mm_sub_ps(iy2,jy1);
1447             dz21             = _mm_sub_ps(iz2,jz1);
1448             dx22             = _mm_sub_ps(ix2,jx2);
1449             dy22             = _mm_sub_ps(iy2,jy2);
1450             dz22             = _mm_sub_ps(iz2,jz2);
1451
1452             /* Calculate squared distance and things based on it */
1453             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1454             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1455             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1456             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1457             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1458             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1459             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1460             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1461             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1462
1463             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1464             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1465             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1466             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1467             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1468             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1469             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1470             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1471             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1472
1473             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1474             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1475             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1476             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1477             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1478             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1479             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1480             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1481             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1482
1483             fjx0             = _mm_setzero_ps();
1484             fjy0             = _mm_setzero_ps();
1485             fjz0             = _mm_setzero_ps();
1486             fjx1             = _mm_setzero_ps();
1487             fjy1             = _mm_setzero_ps();
1488             fjz1             = _mm_setzero_ps();
1489             fjx2             = _mm_setzero_ps();
1490             fjy2             = _mm_setzero_ps();
1491             fjz2             = _mm_setzero_ps();
1492
1493             /**************************
1494              * CALCULATE INTERACTIONS *
1495              **************************/
1496
1497             r00              = _mm_mul_ps(rsq00,rinv00);
1498             r00              = _mm_andnot_ps(dummy_mask,r00);
1499
1500             /* Calculate table index by multiplying r with table scale and truncate to integer */
1501             rt               = _mm_mul_ps(r00,vftabscale);
1502             vfitab           = _mm_cvttps_epi32(rt);
1503 #ifdef __XOP__
1504             vfeps            = _mm_frcz_ps(rt);
1505 #else
1506             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1507 #endif
1508             twovfeps         = _mm_add_ps(vfeps,vfeps);
1509             vfitab           = _mm_slli_epi32(vfitab,3);
1510
1511             /* COULOMB ELECTROSTATICS */
1512             velec            = _mm_mul_ps(qq00,rinv00);
1513             felec            = _mm_mul_ps(velec,rinvsq00);
1514
1515             /* CUBIC SPLINE TABLE DISPERSION */
1516             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1517             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1518             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1519             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1520             _MM_TRANSPOSE4_PS(Y,F,G,H);
1521             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1522             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1523             fvdw6            = _mm_mul_ps(c6_00,FF);
1524
1525             /* CUBIC SPLINE TABLE REPULSION */
1526             vfitab           = _mm_add_epi32(vfitab,ifour);
1527             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1528             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1529             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1530             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1531             _MM_TRANSPOSE4_PS(Y,F,G,H);
1532             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1533             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1534             fvdw12           = _mm_mul_ps(c12_00,FF);
1535             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1536
1537             fscal            = _mm_add_ps(felec,fvdw);
1538
1539             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1540
1541              /* Update vectorial force */
1542             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1543             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1544             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1545
1546             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1547             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1548             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1549
1550             /**************************
1551              * CALCULATE INTERACTIONS *
1552              **************************/
1553
1554             /* COULOMB ELECTROSTATICS */
1555             velec            = _mm_mul_ps(qq01,rinv01);
1556             felec            = _mm_mul_ps(velec,rinvsq01);
1557
1558             fscal            = felec;
1559
1560             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1561
1562              /* Update vectorial force */
1563             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1564             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1565             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1566
1567             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1568             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1569             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1570
1571             /**************************
1572              * CALCULATE INTERACTIONS *
1573              **************************/
1574
1575             /* COULOMB ELECTROSTATICS */
1576             velec            = _mm_mul_ps(qq02,rinv02);
1577             felec            = _mm_mul_ps(velec,rinvsq02);
1578
1579             fscal            = felec;
1580
1581             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1582
1583              /* Update vectorial force */
1584             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1585             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1586             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1587
1588             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1589             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1590             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1591
1592             /**************************
1593              * CALCULATE INTERACTIONS *
1594              **************************/
1595
1596             /* COULOMB ELECTROSTATICS */
1597             velec            = _mm_mul_ps(qq10,rinv10);
1598             felec            = _mm_mul_ps(velec,rinvsq10);
1599
1600             fscal            = felec;
1601
1602             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1603
1604              /* Update vectorial force */
1605             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1606             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1607             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1608
1609             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1610             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1611             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1612
1613             /**************************
1614              * CALCULATE INTERACTIONS *
1615              **************************/
1616
1617             /* COULOMB ELECTROSTATICS */
1618             velec            = _mm_mul_ps(qq11,rinv11);
1619             felec            = _mm_mul_ps(velec,rinvsq11);
1620
1621             fscal            = felec;
1622
1623             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1624
1625              /* Update vectorial force */
1626             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1627             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1628             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1629
1630             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1631             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1632             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1633
1634             /**************************
1635              * CALCULATE INTERACTIONS *
1636              **************************/
1637
1638             /* COULOMB ELECTROSTATICS */
1639             velec            = _mm_mul_ps(qq12,rinv12);
1640             felec            = _mm_mul_ps(velec,rinvsq12);
1641
1642             fscal            = felec;
1643
1644             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1645
1646              /* Update vectorial force */
1647             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1648             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1649             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1650
1651             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1652             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1653             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1654
1655             /**************************
1656              * CALCULATE INTERACTIONS *
1657              **************************/
1658
1659             /* COULOMB ELECTROSTATICS */
1660             velec            = _mm_mul_ps(qq20,rinv20);
1661             felec            = _mm_mul_ps(velec,rinvsq20);
1662
1663             fscal            = felec;
1664
1665             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1666
1667              /* Update vectorial force */
1668             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1669             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1670             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1671
1672             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1673             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1674             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1675
1676             /**************************
1677              * CALCULATE INTERACTIONS *
1678              **************************/
1679
1680             /* COULOMB ELECTROSTATICS */
1681             velec            = _mm_mul_ps(qq21,rinv21);
1682             felec            = _mm_mul_ps(velec,rinvsq21);
1683
1684             fscal            = felec;
1685
1686             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1687
1688              /* Update vectorial force */
1689             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1690             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1691             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1692
1693             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1694             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1695             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1696
1697             /**************************
1698              * CALCULATE INTERACTIONS *
1699              **************************/
1700
1701             /* COULOMB ELECTROSTATICS */
1702             velec            = _mm_mul_ps(qq22,rinv22);
1703             felec            = _mm_mul_ps(velec,rinvsq22);
1704
1705             fscal            = felec;
1706
1707             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1708
1709              /* Update vectorial force */
1710             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1711             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1712             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1713
1714             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1715             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1716             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1717
1718             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1719             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1720             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1721             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1722
1723             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1724                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1725
1726             /* Inner loop uses 298 flops */
1727         }
1728
1729         /* End of innermost loop */
1730
1731         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1732                                               f+i_coord_offset,fshift+i_shift_offset);
1733
1734         /* Increment number of inner iterations */
1735         inneriter                  += j_index_end - j_index_start;
1736
1737         /* Outer loop uses 18 flops */
1738     }
1739
1740     /* Increment number of outer iterations */
1741     outeriter        += nri;
1742
1743     /* Update outer/inner flops */
1744
1745     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*298);
1746 }