Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_avx_128_fma_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
38  * Electrostatics interaction: Coulomb
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B;
73     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B;
75     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B;
77     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
94     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
95     __m128i          vfitab;
96     __m128i          ifour       = _mm_set1_epi32(4);
97     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
98     real             *vftab;
99     __m128d          dummy_mask,cutoff_mask;
100     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101     __m128d          one     = _mm_set1_pd(1.0);
102     __m128d          two     = _mm_set1_pd(2.0);
103     x                = xx[0];
104     f                = ff[0];
105
106     nri              = nlist->nri;
107     iinr             = nlist->iinr;
108     jindex           = nlist->jindex;
109     jjnr             = nlist->jjnr;
110     shiftidx         = nlist->shift;
111     gid              = nlist->gid;
112     shiftvec         = fr->shift_vec[0];
113     fshift           = fr->fshift[0];
114     facel            = _mm_set1_pd(fr->epsfac);
115     charge           = mdatoms->chargeA;
116     nvdwtype         = fr->ntype;
117     vdwparam         = fr->nbfp;
118     vdwtype          = mdatoms->typeA;
119
120     vftab            = kernel_data->table_vdw->data;
121     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
122
123     /* Setup water-specific parameters */
124     inr              = nlist->iinr[0];
125     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
126     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
128     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
129
130     jq0              = _mm_set1_pd(charge[inr+0]);
131     jq1              = _mm_set1_pd(charge[inr+1]);
132     jq2              = _mm_set1_pd(charge[inr+2]);
133     vdwjidx0A        = 2*vdwtype[inr+0];
134     qq00             = _mm_mul_pd(iq0,jq0);
135     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
137     qq01             = _mm_mul_pd(iq0,jq1);
138     qq02             = _mm_mul_pd(iq0,jq2);
139     qq10             = _mm_mul_pd(iq1,jq0);
140     qq11             = _mm_mul_pd(iq1,jq1);
141     qq12             = _mm_mul_pd(iq1,jq2);
142     qq20             = _mm_mul_pd(iq2,jq0);
143     qq21             = _mm_mul_pd(iq2,jq1);
144     qq22             = _mm_mul_pd(iq2,jq2);
145
146     /* Avoid stupid compiler warnings */
147     jnrA = jnrB = 0;
148     j_coord_offsetA = 0;
149     j_coord_offsetB = 0;
150
151     outeriter        = 0;
152     inneriter        = 0;
153
154     /* Start outer loop over neighborlists */
155     for(iidx=0; iidx<nri; iidx++)
156     {
157         /* Load shift vector for this list */
158         i_shift_offset   = DIM*shiftidx[iidx];
159
160         /* Load limits for loop over neighbors */
161         j_index_start    = jindex[iidx];
162         j_index_end      = jindex[iidx+1];
163
164         /* Get outer coordinate index */
165         inr              = iinr[iidx];
166         i_coord_offset   = DIM*inr;
167
168         /* Load i particle coords and add shift vector */
169         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
171
172         fix0             = _mm_setzero_pd();
173         fiy0             = _mm_setzero_pd();
174         fiz0             = _mm_setzero_pd();
175         fix1             = _mm_setzero_pd();
176         fiy1             = _mm_setzero_pd();
177         fiz1             = _mm_setzero_pd();
178         fix2             = _mm_setzero_pd();
179         fiy2             = _mm_setzero_pd();
180         fiz2             = _mm_setzero_pd();
181
182         /* Reset potential sums */
183         velecsum         = _mm_setzero_pd();
184         vvdwsum          = _mm_setzero_pd();
185
186         /* Start inner kernel loop */
187         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
188         {
189
190             /* Get j neighbor index, and coordinate index */
191             jnrA             = jjnr[jidx];
192             jnrB             = jjnr[jidx+1];
193             j_coord_offsetA  = DIM*jnrA;
194             j_coord_offsetB  = DIM*jnrB;
195
196             /* load j atom coordinates */
197             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
198                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
199
200             /* Calculate displacement vector */
201             dx00             = _mm_sub_pd(ix0,jx0);
202             dy00             = _mm_sub_pd(iy0,jy0);
203             dz00             = _mm_sub_pd(iz0,jz0);
204             dx01             = _mm_sub_pd(ix0,jx1);
205             dy01             = _mm_sub_pd(iy0,jy1);
206             dz01             = _mm_sub_pd(iz0,jz1);
207             dx02             = _mm_sub_pd(ix0,jx2);
208             dy02             = _mm_sub_pd(iy0,jy2);
209             dz02             = _mm_sub_pd(iz0,jz2);
210             dx10             = _mm_sub_pd(ix1,jx0);
211             dy10             = _mm_sub_pd(iy1,jy0);
212             dz10             = _mm_sub_pd(iz1,jz0);
213             dx11             = _mm_sub_pd(ix1,jx1);
214             dy11             = _mm_sub_pd(iy1,jy1);
215             dz11             = _mm_sub_pd(iz1,jz1);
216             dx12             = _mm_sub_pd(ix1,jx2);
217             dy12             = _mm_sub_pd(iy1,jy2);
218             dz12             = _mm_sub_pd(iz1,jz2);
219             dx20             = _mm_sub_pd(ix2,jx0);
220             dy20             = _mm_sub_pd(iy2,jy0);
221             dz20             = _mm_sub_pd(iz2,jz0);
222             dx21             = _mm_sub_pd(ix2,jx1);
223             dy21             = _mm_sub_pd(iy2,jy1);
224             dz21             = _mm_sub_pd(iz2,jz1);
225             dx22             = _mm_sub_pd(ix2,jx2);
226             dy22             = _mm_sub_pd(iy2,jy2);
227             dz22             = _mm_sub_pd(iz2,jz2);
228
229             /* Calculate squared distance and things based on it */
230             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
232             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
233             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
234             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
235             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
236             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
237             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
238             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
239
240             rinv00           = gmx_mm_invsqrt_pd(rsq00);
241             rinv01           = gmx_mm_invsqrt_pd(rsq01);
242             rinv02           = gmx_mm_invsqrt_pd(rsq02);
243             rinv10           = gmx_mm_invsqrt_pd(rsq10);
244             rinv11           = gmx_mm_invsqrt_pd(rsq11);
245             rinv12           = gmx_mm_invsqrt_pd(rsq12);
246             rinv20           = gmx_mm_invsqrt_pd(rsq20);
247             rinv21           = gmx_mm_invsqrt_pd(rsq21);
248             rinv22           = gmx_mm_invsqrt_pd(rsq22);
249
250             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
251             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
252             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
253             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
254             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
255             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
256             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
257             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
258             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
259
260             fjx0             = _mm_setzero_pd();
261             fjy0             = _mm_setzero_pd();
262             fjz0             = _mm_setzero_pd();
263             fjx1             = _mm_setzero_pd();
264             fjy1             = _mm_setzero_pd();
265             fjz1             = _mm_setzero_pd();
266             fjx2             = _mm_setzero_pd();
267             fjy2             = _mm_setzero_pd();
268             fjz2             = _mm_setzero_pd();
269
270             /**************************
271              * CALCULATE INTERACTIONS *
272              **************************/
273
274             r00              = _mm_mul_pd(rsq00,rinv00);
275
276             /* Calculate table index by multiplying r with table scale and truncate to integer */
277             rt               = _mm_mul_pd(r00,vftabscale);
278             vfitab           = _mm_cvttpd_epi32(rt);
279 #ifdef __XOP__
280             vfeps            = _mm_frcz_pd(rt);
281 #else
282             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
283 #endif
284             twovfeps         = _mm_add_pd(vfeps,vfeps);
285             vfitab           = _mm_slli_epi32(vfitab,3);
286
287             /* COULOMB ELECTROSTATICS */
288             velec            = _mm_mul_pd(qq00,rinv00);
289             felec            = _mm_mul_pd(velec,rinvsq00);
290
291             /* CUBIC SPLINE TABLE DISPERSION */
292             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
293             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
294             GMX_MM_TRANSPOSE2_PD(Y,F);
295             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
296             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
297             GMX_MM_TRANSPOSE2_PD(G,H);
298             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
299             VV               = _mm_macc_pd(vfeps,Fp,Y);
300             vvdw6            = _mm_mul_pd(c6_00,VV);
301             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
302             fvdw6            = _mm_mul_pd(c6_00,FF);
303
304             /* CUBIC SPLINE TABLE REPULSION */
305             vfitab           = _mm_add_epi32(vfitab,ifour);
306             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
307             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
308             GMX_MM_TRANSPOSE2_PD(Y,F);
309             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
310             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
311             GMX_MM_TRANSPOSE2_PD(G,H);
312             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
313             VV               = _mm_macc_pd(vfeps,Fp,Y);
314             vvdw12           = _mm_mul_pd(c12_00,VV);
315             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
316             fvdw12           = _mm_mul_pd(c12_00,FF);
317             vvdw             = _mm_add_pd(vvdw12,vvdw6);
318             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
319
320             /* Update potential sum for this i atom from the interaction with this j atom. */
321             velecsum         = _mm_add_pd(velecsum,velec);
322             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
323
324             fscal            = _mm_add_pd(felec,fvdw);
325
326             /* Update vectorial force */
327             fix0             = _mm_macc_pd(dx00,fscal,fix0);
328             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
329             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
330             
331             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
332             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
333             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
334
335             /**************************
336              * CALCULATE INTERACTIONS *
337              **************************/
338
339             /* COULOMB ELECTROSTATICS */
340             velec            = _mm_mul_pd(qq01,rinv01);
341             felec            = _mm_mul_pd(velec,rinvsq01);
342
343             /* Update potential sum for this i atom from the interaction with this j atom. */
344             velecsum         = _mm_add_pd(velecsum,velec);
345
346             fscal            = felec;
347
348             /* Update vectorial force */
349             fix0             = _mm_macc_pd(dx01,fscal,fix0);
350             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
351             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
352             
353             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
354             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
355             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
356
357             /**************************
358              * CALCULATE INTERACTIONS *
359              **************************/
360
361             /* COULOMB ELECTROSTATICS */
362             velec            = _mm_mul_pd(qq02,rinv02);
363             felec            = _mm_mul_pd(velec,rinvsq02);
364
365             /* Update potential sum for this i atom from the interaction with this j atom. */
366             velecsum         = _mm_add_pd(velecsum,velec);
367
368             fscal            = felec;
369
370             /* Update vectorial force */
371             fix0             = _mm_macc_pd(dx02,fscal,fix0);
372             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
373             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
374             
375             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
376             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
377             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
378
379             /**************************
380              * CALCULATE INTERACTIONS *
381              **************************/
382
383             /* COULOMB ELECTROSTATICS */
384             velec            = _mm_mul_pd(qq10,rinv10);
385             felec            = _mm_mul_pd(velec,rinvsq10);
386
387             /* Update potential sum for this i atom from the interaction with this j atom. */
388             velecsum         = _mm_add_pd(velecsum,velec);
389
390             fscal            = felec;
391
392             /* Update vectorial force */
393             fix1             = _mm_macc_pd(dx10,fscal,fix1);
394             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
395             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
396             
397             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
398             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
399             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
400
401             /**************************
402              * CALCULATE INTERACTIONS *
403              **************************/
404
405             /* COULOMB ELECTROSTATICS */
406             velec            = _mm_mul_pd(qq11,rinv11);
407             felec            = _mm_mul_pd(velec,rinvsq11);
408
409             /* Update potential sum for this i atom from the interaction with this j atom. */
410             velecsum         = _mm_add_pd(velecsum,velec);
411
412             fscal            = felec;
413
414             /* Update vectorial force */
415             fix1             = _mm_macc_pd(dx11,fscal,fix1);
416             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
417             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
418             
419             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
420             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
421             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
422
423             /**************************
424              * CALCULATE INTERACTIONS *
425              **************************/
426
427             /* COULOMB ELECTROSTATICS */
428             velec            = _mm_mul_pd(qq12,rinv12);
429             felec            = _mm_mul_pd(velec,rinvsq12);
430
431             /* Update potential sum for this i atom from the interaction with this j atom. */
432             velecsum         = _mm_add_pd(velecsum,velec);
433
434             fscal            = felec;
435
436             /* Update vectorial force */
437             fix1             = _mm_macc_pd(dx12,fscal,fix1);
438             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
439             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
440             
441             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
442             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
443             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
444
445             /**************************
446              * CALCULATE INTERACTIONS *
447              **************************/
448
449             /* COULOMB ELECTROSTATICS */
450             velec            = _mm_mul_pd(qq20,rinv20);
451             felec            = _mm_mul_pd(velec,rinvsq20);
452
453             /* Update potential sum for this i atom from the interaction with this j atom. */
454             velecsum         = _mm_add_pd(velecsum,velec);
455
456             fscal            = felec;
457
458             /* Update vectorial force */
459             fix2             = _mm_macc_pd(dx20,fscal,fix2);
460             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
461             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
462             
463             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
464             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
465             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
466
467             /**************************
468              * CALCULATE INTERACTIONS *
469              **************************/
470
471             /* COULOMB ELECTROSTATICS */
472             velec            = _mm_mul_pd(qq21,rinv21);
473             felec            = _mm_mul_pd(velec,rinvsq21);
474
475             /* Update potential sum for this i atom from the interaction with this j atom. */
476             velecsum         = _mm_add_pd(velecsum,velec);
477
478             fscal            = felec;
479
480             /* Update vectorial force */
481             fix2             = _mm_macc_pd(dx21,fscal,fix2);
482             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
483             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
484             
485             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
486             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
487             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
488
489             /**************************
490              * CALCULATE INTERACTIONS *
491              **************************/
492
493             /* COULOMB ELECTROSTATICS */
494             velec            = _mm_mul_pd(qq22,rinv22);
495             felec            = _mm_mul_pd(velec,rinvsq22);
496
497             /* Update potential sum for this i atom from the interaction with this j atom. */
498             velecsum         = _mm_add_pd(velecsum,velec);
499
500             fscal            = felec;
501
502             /* Update vectorial force */
503             fix2             = _mm_macc_pd(dx22,fscal,fix2);
504             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
505             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
506             
507             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
508             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
509             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
510
511             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
512
513             /* Inner loop uses 314 flops */
514         }
515
516         if(jidx<j_index_end)
517         {
518
519             jnrA             = jjnr[jidx];
520             j_coord_offsetA  = DIM*jnrA;
521
522             /* load j atom coordinates */
523             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
524                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
525
526             /* Calculate displacement vector */
527             dx00             = _mm_sub_pd(ix0,jx0);
528             dy00             = _mm_sub_pd(iy0,jy0);
529             dz00             = _mm_sub_pd(iz0,jz0);
530             dx01             = _mm_sub_pd(ix0,jx1);
531             dy01             = _mm_sub_pd(iy0,jy1);
532             dz01             = _mm_sub_pd(iz0,jz1);
533             dx02             = _mm_sub_pd(ix0,jx2);
534             dy02             = _mm_sub_pd(iy0,jy2);
535             dz02             = _mm_sub_pd(iz0,jz2);
536             dx10             = _mm_sub_pd(ix1,jx0);
537             dy10             = _mm_sub_pd(iy1,jy0);
538             dz10             = _mm_sub_pd(iz1,jz0);
539             dx11             = _mm_sub_pd(ix1,jx1);
540             dy11             = _mm_sub_pd(iy1,jy1);
541             dz11             = _mm_sub_pd(iz1,jz1);
542             dx12             = _mm_sub_pd(ix1,jx2);
543             dy12             = _mm_sub_pd(iy1,jy2);
544             dz12             = _mm_sub_pd(iz1,jz2);
545             dx20             = _mm_sub_pd(ix2,jx0);
546             dy20             = _mm_sub_pd(iy2,jy0);
547             dz20             = _mm_sub_pd(iz2,jz0);
548             dx21             = _mm_sub_pd(ix2,jx1);
549             dy21             = _mm_sub_pd(iy2,jy1);
550             dz21             = _mm_sub_pd(iz2,jz1);
551             dx22             = _mm_sub_pd(ix2,jx2);
552             dy22             = _mm_sub_pd(iy2,jy2);
553             dz22             = _mm_sub_pd(iz2,jz2);
554
555             /* Calculate squared distance and things based on it */
556             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
557             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
558             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
559             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
560             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
561             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
562             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
563             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
564             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
565
566             rinv00           = gmx_mm_invsqrt_pd(rsq00);
567             rinv01           = gmx_mm_invsqrt_pd(rsq01);
568             rinv02           = gmx_mm_invsqrt_pd(rsq02);
569             rinv10           = gmx_mm_invsqrt_pd(rsq10);
570             rinv11           = gmx_mm_invsqrt_pd(rsq11);
571             rinv12           = gmx_mm_invsqrt_pd(rsq12);
572             rinv20           = gmx_mm_invsqrt_pd(rsq20);
573             rinv21           = gmx_mm_invsqrt_pd(rsq21);
574             rinv22           = gmx_mm_invsqrt_pd(rsq22);
575
576             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
577             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
578             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
579             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
580             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
581             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
582             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
583             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
584             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
585
586             fjx0             = _mm_setzero_pd();
587             fjy0             = _mm_setzero_pd();
588             fjz0             = _mm_setzero_pd();
589             fjx1             = _mm_setzero_pd();
590             fjy1             = _mm_setzero_pd();
591             fjz1             = _mm_setzero_pd();
592             fjx2             = _mm_setzero_pd();
593             fjy2             = _mm_setzero_pd();
594             fjz2             = _mm_setzero_pd();
595
596             /**************************
597              * CALCULATE INTERACTIONS *
598              **************************/
599
600             r00              = _mm_mul_pd(rsq00,rinv00);
601
602             /* Calculate table index by multiplying r with table scale and truncate to integer */
603             rt               = _mm_mul_pd(r00,vftabscale);
604             vfitab           = _mm_cvttpd_epi32(rt);
605 #ifdef __XOP__
606             vfeps            = _mm_frcz_pd(rt);
607 #else
608             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
609 #endif
610             twovfeps         = _mm_add_pd(vfeps,vfeps);
611             vfitab           = _mm_slli_epi32(vfitab,3);
612
613             /* COULOMB ELECTROSTATICS */
614             velec            = _mm_mul_pd(qq00,rinv00);
615             felec            = _mm_mul_pd(velec,rinvsq00);
616
617             /* CUBIC SPLINE TABLE DISPERSION */
618             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
619             F                = _mm_setzero_pd();
620             GMX_MM_TRANSPOSE2_PD(Y,F);
621             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
622             H                = _mm_setzero_pd();
623             GMX_MM_TRANSPOSE2_PD(G,H);
624             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
625             VV               = _mm_macc_pd(vfeps,Fp,Y);
626             vvdw6            = _mm_mul_pd(c6_00,VV);
627             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
628             fvdw6            = _mm_mul_pd(c6_00,FF);
629
630             /* CUBIC SPLINE TABLE REPULSION */
631             vfitab           = _mm_add_epi32(vfitab,ifour);
632             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
633             F                = _mm_setzero_pd();
634             GMX_MM_TRANSPOSE2_PD(Y,F);
635             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
636             H                = _mm_setzero_pd();
637             GMX_MM_TRANSPOSE2_PD(G,H);
638             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
639             VV               = _mm_macc_pd(vfeps,Fp,Y);
640             vvdw12           = _mm_mul_pd(c12_00,VV);
641             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
642             fvdw12           = _mm_mul_pd(c12_00,FF);
643             vvdw             = _mm_add_pd(vvdw12,vvdw6);
644             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
645
646             /* Update potential sum for this i atom from the interaction with this j atom. */
647             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
648             velecsum         = _mm_add_pd(velecsum,velec);
649             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
650             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
651
652             fscal            = _mm_add_pd(felec,fvdw);
653
654             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
655
656             /* Update vectorial force */
657             fix0             = _mm_macc_pd(dx00,fscal,fix0);
658             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
659             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
660             
661             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
662             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
663             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
664
665             /**************************
666              * CALCULATE INTERACTIONS *
667              **************************/
668
669             /* COULOMB ELECTROSTATICS */
670             velec            = _mm_mul_pd(qq01,rinv01);
671             felec            = _mm_mul_pd(velec,rinvsq01);
672
673             /* Update potential sum for this i atom from the interaction with this j atom. */
674             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
675             velecsum         = _mm_add_pd(velecsum,velec);
676
677             fscal            = felec;
678
679             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
680
681             /* Update vectorial force */
682             fix0             = _mm_macc_pd(dx01,fscal,fix0);
683             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
684             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
685             
686             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
687             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
688             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
689
690             /**************************
691              * CALCULATE INTERACTIONS *
692              **************************/
693
694             /* COULOMB ELECTROSTATICS */
695             velec            = _mm_mul_pd(qq02,rinv02);
696             felec            = _mm_mul_pd(velec,rinvsq02);
697
698             /* Update potential sum for this i atom from the interaction with this j atom. */
699             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
700             velecsum         = _mm_add_pd(velecsum,velec);
701
702             fscal            = felec;
703
704             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
705
706             /* Update vectorial force */
707             fix0             = _mm_macc_pd(dx02,fscal,fix0);
708             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
709             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
710             
711             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
712             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
713             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
714
715             /**************************
716              * CALCULATE INTERACTIONS *
717              **************************/
718
719             /* COULOMB ELECTROSTATICS */
720             velec            = _mm_mul_pd(qq10,rinv10);
721             felec            = _mm_mul_pd(velec,rinvsq10);
722
723             /* Update potential sum for this i atom from the interaction with this j atom. */
724             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
725             velecsum         = _mm_add_pd(velecsum,velec);
726
727             fscal            = felec;
728
729             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
730
731             /* Update vectorial force */
732             fix1             = _mm_macc_pd(dx10,fscal,fix1);
733             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
734             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
735             
736             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
737             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
738             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
739
740             /**************************
741              * CALCULATE INTERACTIONS *
742              **************************/
743
744             /* COULOMB ELECTROSTATICS */
745             velec            = _mm_mul_pd(qq11,rinv11);
746             felec            = _mm_mul_pd(velec,rinvsq11);
747
748             /* Update potential sum for this i atom from the interaction with this j atom. */
749             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
750             velecsum         = _mm_add_pd(velecsum,velec);
751
752             fscal            = felec;
753
754             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
755
756             /* Update vectorial force */
757             fix1             = _mm_macc_pd(dx11,fscal,fix1);
758             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
759             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
760             
761             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
762             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
763             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
764
765             /**************************
766              * CALCULATE INTERACTIONS *
767              **************************/
768
769             /* COULOMB ELECTROSTATICS */
770             velec            = _mm_mul_pd(qq12,rinv12);
771             felec            = _mm_mul_pd(velec,rinvsq12);
772
773             /* Update potential sum for this i atom from the interaction with this j atom. */
774             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
775             velecsum         = _mm_add_pd(velecsum,velec);
776
777             fscal            = felec;
778
779             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
780
781             /* Update vectorial force */
782             fix1             = _mm_macc_pd(dx12,fscal,fix1);
783             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
784             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
785             
786             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
787             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
788             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
789
790             /**************************
791              * CALCULATE INTERACTIONS *
792              **************************/
793
794             /* COULOMB ELECTROSTATICS */
795             velec            = _mm_mul_pd(qq20,rinv20);
796             felec            = _mm_mul_pd(velec,rinvsq20);
797
798             /* Update potential sum for this i atom from the interaction with this j atom. */
799             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
800             velecsum         = _mm_add_pd(velecsum,velec);
801
802             fscal            = felec;
803
804             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
805
806             /* Update vectorial force */
807             fix2             = _mm_macc_pd(dx20,fscal,fix2);
808             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
809             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
810             
811             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
812             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
813             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
814
815             /**************************
816              * CALCULATE INTERACTIONS *
817              **************************/
818
819             /* COULOMB ELECTROSTATICS */
820             velec            = _mm_mul_pd(qq21,rinv21);
821             felec            = _mm_mul_pd(velec,rinvsq21);
822
823             /* Update potential sum for this i atom from the interaction with this j atom. */
824             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
825             velecsum         = _mm_add_pd(velecsum,velec);
826
827             fscal            = felec;
828
829             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
830
831             /* Update vectorial force */
832             fix2             = _mm_macc_pd(dx21,fscal,fix2);
833             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
834             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
835             
836             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
837             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
838             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
839
840             /**************************
841              * CALCULATE INTERACTIONS *
842              **************************/
843
844             /* COULOMB ELECTROSTATICS */
845             velec            = _mm_mul_pd(qq22,rinv22);
846             felec            = _mm_mul_pd(velec,rinvsq22);
847
848             /* Update potential sum for this i atom from the interaction with this j atom. */
849             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
850             velecsum         = _mm_add_pd(velecsum,velec);
851
852             fscal            = felec;
853
854             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
855
856             /* Update vectorial force */
857             fix2             = _mm_macc_pd(dx22,fscal,fix2);
858             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
859             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
860             
861             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
862             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
863             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
864
865             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
866
867             /* Inner loop uses 314 flops */
868         }
869
870         /* End of innermost loop */
871
872         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
873                                               f+i_coord_offset,fshift+i_shift_offset);
874
875         ggid                        = gid[iidx];
876         /* Update potential energies */
877         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
878         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
879
880         /* Increment number of inner iterations */
881         inneriter                  += j_index_end - j_index_start;
882
883         /* Outer loop uses 20 flops */
884     }
885
886     /* Increment number of outer iterations */
887     outeriter        += nri;
888
889     /* Update outer/inner flops */
890
891     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*314);
892 }
893 /*
894  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_double
895  * Electrostatics interaction: Coulomb
896  * VdW interaction:            CubicSplineTable
897  * Geometry:                   Water3-Water3
898  * Calculate force/pot:        Force
899  */
900 void
901 nb_kernel_ElecCoul_VdwCSTab_GeomW3W3_F_avx_128_fma_double
902                     (t_nblist * gmx_restrict                nlist,
903                      rvec * gmx_restrict                    xx,
904                      rvec * gmx_restrict                    ff,
905                      t_forcerec * gmx_restrict              fr,
906                      t_mdatoms * gmx_restrict               mdatoms,
907                      nb_kernel_data_t * gmx_restrict        kernel_data,
908                      t_nrnb * gmx_restrict                  nrnb)
909 {
910     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
911      * just 0 for non-waters.
912      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
913      * jnr indices corresponding to data put in the four positions in the SIMD register.
914      */
915     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
916     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
917     int              jnrA,jnrB;
918     int              j_coord_offsetA,j_coord_offsetB;
919     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
920     real             rcutoff_scalar;
921     real             *shiftvec,*fshift,*x,*f;
922     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
923     int              vdwioffset0;
924     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
925     int              vdwioffset1;
926     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
927     int              vdwioffset2;
928     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
929     int              vdwjidx0A,vdwjidx0B;
930     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
931     int              vdwjidx1A,vdwjidx1B;
932     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
933     int              vdwjidx2A,vdwjidx2B;
934     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
935     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
936     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
937     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
938     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
939     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
940     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
941     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
942     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
943     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
944     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
945     real             *charge;
946     int              nvdwtype;
947     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
948     int              *vdwtype;
949     real             *vdwparam;
950     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
951     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
952     __m128i          vfitab;
953     __m128i          ifour       = _mm_set1_epi32(4);
954     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
955     real             *vftab;
956     __m128d          dummy_mask,cutoff_mask;
957     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
958     __m128d          one     = _mm_set1_pd(1.0);
959     __m128d          two     = _mm_set1_pd(2.0);
960     x                = xx[0];
961     f                = ff[0];
962
963     nri              = nlist->nri;
964     iinr             = nlist->iinr;
965     jindex           = nlist->jindex;
966     jjnr             = nlist->jjnr;
967     shiftidx         = nlist->shift;
968     gid              = nlist->gid;
969     shiftvec         = fr->shift_vec[0];
970     fshift           = fr->fshift[0];
971     facel            = _mm_set1_pd(fr->epsfac);
972     charge           = mdatoms->chargeA;
973     nvdwtype         = fr->ntype;
974     vdwparam         = fr->nbfp;
975     vdwtype          = mdatoms->typeA;
976
977     vftab            = kernel_data->table_vdw->data;
978     vftabscale       = _mm_set1_pd(kernel_data->table_vdw->scale);
979
980     /* Setup water-specific parameters */
981     inr              = nlist->iinr[0];
982     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
983     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
984     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
985     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
986
987     jq0              = _mm_set1_pd(charge[inr+0]);
988     jq1              = _mm_set1_pd(charge[inr+1]);
989     jq2              = _mm_set1_pd(charge[inr+2]);
990     vdwjidx0A        = 2*vdwtype[inr+0];
991     qq00             = _mm_mul_pd(iq0,jq0);
992     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
993     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
994     qq01             = _mm_mul_pd(iq0,jq1);
995     qq02             = _mm_mul_pd(iq0,jq2);
996     qq10             = _mm_mul_pd(iq1,jq0);
997     qq11             = _mm_mul_pd(iq1,jq1);
998     qq12             = _mm_mul_pd(iq1,jq2);
999     qq20             = _mm_mul_pd(iq2,jq0);
1000     qq21             = _mm_mul_pd(iq2,jq1);
1001     qq22             = _mm_mul_pd(iq2,jq2);
1002
1003     /* Avoid stupid compiler warnings */
1004     jnrA = jnrB = 0;
1005     j_coord_offsetA = 0;
1006     j_coord_offsetB = 0;
1007
1008     outeriter        = 0;
1009     inneriter        = 0;
1010
1011     /* Start outer loop over neighborlists */
1012     for(iidx=0; iidx<nri; iidx++)
1013     {
1014         /* Load shift vector for this list */
1015         i_shift_offset   = DIM*shiftidx[iidx];
1016
1017         /* Load limits for loop over neighbors */
1018         j_index_start    = jindex[iidx];
1019         j_index_end      = jindex[iidx+1];
1020
1021         /* Get outer coordinate index */
1022         inr              = iinr[iidx];
1023         i_coord_offset   = DIM*inr;
1024
1025         /* Load i particle coords and add shift vector */
1026         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1027                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1028
1029         fix0             = _mm_setzero_pd();
1030         fiy0             = _mm_setzero_pd();
1031         fiz0             = _mm_setzero_pd();
1032         fix1             = _mm_setzero_pd();
1033         fiy1             = _mm_setzero_pd();
1034         fiz1             = _mm_setzero_pd();
1035         fix2             = _mm_setzero_pd();
1036         fiy2             = _mm_setzero_pd();
1037         fiz2             = _mm_setzero_pd();
1038
1039         /* Start inner kernel loop */
1040         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1041         {
1042
1043             /* Get j neighbor index, and coordinate index */
1044             jnrA             = jjnr[jidx];
1045             jnrB             = jjnr[jidx+1];
1046             j_coord_offsetA  = DIM*jnrA;
1047             j_coord_offsetB  = DIM*jnrB;
1048
1049             /* load j atom coordinates */
1050             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1051                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1052
1053             /* Calculate displacement vector */
1054             dx00             = _mm_sub_pd(ix0,jx0);
1055             dy00             = _mm_sub_pd(iy0,jy0);
1056             dz00             = _mm_sub_pd(iz0,jz0);
1057             dx01             = _mm_sub_pd(ix0,jx1);
1058             dy01             = _mm_sub_pd(iy0,jy1);
1059             dz01             = _mm_sub_pd(iz0,jz1);
1060             dx02             = _mm_sub_pd(ix0,jx2);
1061             dy02             = _mm_sub_pd(iy0,jy2);
1062             dz02             = _mm_sub_pd(iz0,jz2);
1063             dx10             = _mm_sub_pd(ix1,jx0);
1064             dy10             = _mm_sub_pd(iy1,jy0);
1065             dz10             = _mm_sub_pd(iz1,jz0);
1066             dx11             = _mm_sub_pd(ix1,jx1);
1067             dy11             = _mm_sub_pd(iy1,jy1);
1068             dz11             = _mm_sub_pd(iz1,jz1);
1069             dx12             = _mm_sub_pd(ix1,jx2);
1070             dy12             = _mm_sub_pd(iy1,jy2);
1071             dz12             = _mm_sub_pd(iz1,jz2);
1072             dx20             = _mm_sub_pd(ix2,jx0);
1073             dy20             = _mm_sub_pd(iy2,jy0);
1074             dz20             = _mm_sub_pd(iz2,jz0);
1075             dx21             = _mm_sub_pd(ix2,jx1);
1076             dy21             = _mm_sub_pd(iy2,jy1);
1077             dz21             = _mm_sub_pd(iz2,jz1);
1078             dx22             = _mm_sub_pd(ix2,jx2);
1079             dy22             = _mm_sub_pd(iy2,jy2);
1080             dz22             = _mm_sub_pd(iz2,jz2);
1081
1082             /* Calculate squared distance and things based on it */
1083             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1084             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1085             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1086             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1087             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1088             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1089             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1090             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1091             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1092
1093             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1094             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1095             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1096             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1097             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1098             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1099             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1100             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1101             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1102
1103             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1104             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1105             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1106             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1107             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1108             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1109             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1110             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1111             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1112
1113             fjx0             = _mm_setzero_pd();
1114             fjy0             = _mm_setzero_pd();
1115             fjz0             = _mm_setzero_pd();
1116             fjx1             = _mm_setzero_pd();
1117             fjy1             = _mm_setzero_pd();
1118             fjz1             = _mm_setzero_pd();
1119             fjx2             = _mm_setzero_pd();
1120             fjy2             = _mm_setzero_pd();
1121             fjz2             = _mm_setzero_pd();
1122
1123             /**************************
1124              * CALCULATE INTERACTIONS *
1125              **************************/
1126
1127             r00              = _mm_mul_pd(rsq00,rinv00);
1128
1129             /* Calculate table index by multiplying r with table scale and truncate to integer */
1130             rt               = _mm_mul_pd(r00,vftabscale);
1131             vfitab           = _mm_cvttpd_epi32(rt);
1132 #ifdef __XOP__
1133             vfeps            = _mm_frcz_pd(rt);
1134 #else
1135             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1136 #endif
1137             twovfeps         = _mm_add_pd(vfeps,vfeps);
1138             vfitab           = _mm_slli_epi32(vfitab,3);
1139
1140             /* COULOMB ELECTROSTATICS */
1141             velec            = _mm_mul_pd(qq00,rinv00);
1142             felec            = _mm_mul_pd(velec,rinvsq00);
1143
1144             /* CUBIC SPLINE TABLE DISPERSION */
1145             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1146             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1147             GMX_MM_TRANSPOSE2_PD(Y,F);
1148             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1149             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1150             GMX_MM_TRANSPOSE2_PD(G,H);
1151             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1152             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1153             fvdw6            = _mm_mul_pd(c6_00,FF);
1154
1155             /* CUBIC SPLINE TABLE REPULSION */
1156             vfitab           = _mm_add_epi32(vfitab,ifour);
1157             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1158             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1159             GMX_MM_TRANSPOSE2_PD(Y,F);
1160             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1161             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1162             GMX_MM_TRANSPOSE2_PD(G,H);
1163             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1164             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1165             fvdw12           = _mm_mul_pd(c12_00,FF);
1166             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1167
1168             fscal            = _mm_add_pd(felec,fvdw);
1169
1170             /* Update vectorial force */
1171             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1172             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1173             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1174             
1175             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1176             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1177             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1178
1179             /**************************
1180              * CALCULATE INTERACTIONS *
1181              **************************/
1182
1183             /* COULOMB ELECTROSTATICS */
1184             velec            = _mm_mul_pd(qq01,rinv01);
1185             felec            = _mm_mul_pd(velec,rinvsq01);
1186
1187             fscal            = felec;
1188
1189             /* Update vectorial force */
1190             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1191             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1192             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1193             
1194             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1195             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1196             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1197
1198             /**************************
1199              * CALCULATE INTERACTIONS *
1200              **************************/
1201
1202             /* COULOMB ELECTROSTATICS */
1203             velec            = _mm_mul_pd(qq02,rinv02);
1204             felec            = _mm_mul_pd(velec,rinvsq02);
1205
1206             fscal            = felec;
1207
1208             /* Update vectorial force */
1209             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1210             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1211             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1212             
1213             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1214             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1215             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1216
1217             /**************************
1218              * CALCULATE INTERACTIONS *
1219              **************************/
1220
1221             /* COULOMB ELECTROSTATICS */
1222             velec            = _mm_mul_pd(qq10,rinv10);
1223             felec            = _mm_mul_pd(velec,rinvsq10);
1224
1225             fscal            = felec;
1226
1227             /* Update vectorial force */
1228             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1229             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1230             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1231             
1232             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1233             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1234             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1235
1236             /**************************
1237              * CALCULATE INTERACTIONS *
1238              **************************/
1239
1240             /* COULOMB ELECTROSTATICS */
1241             velec            = _mm_mul_pd(qq11,rinv11);
1242             felec            = _mm_mul_pd(velec,rinvsq11);
1243
1244             fscal            = felec;
1245
1246             /* Update vectorial force */
1247             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1248             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1249             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1250             
1251             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1252             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1253             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1254
1255             /**************************
1256              * CALCULATE INTERACTIONS *
1257              **************************/
1258
1259             /* COULOMB ELECTROSTATICS */
1260             velec            = _mm_mul_pd(qq12,rinv12);
1261             felec            = _mm_mul_pd(velec,rinvsq12);
1262
1263             fscal            = felec;
1264
1265             /* Update vectorial force */
1266             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1267             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1268             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1269             
1270             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1271             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1272             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1273
1274             /**************************
1275              * CALCULATE INTERACTIONS *
1276              **************************/
1277
1278             /* COULOMB ELECTROSTATICS */
1279             velec            = _mm_mul_pd(qq20,rinv20);
1280             felec            = _mm_mul_pd(velec,rinvsq20);
1281
1282             fscal            = felec;
1283
1284             /* Update vectorial force */
1285             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1286             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1287             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1288             
1289             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1290             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1291             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1292
1293             /**************************
1294              * CALCULATE INTERACTIONS *
1295              **************************/
1296
1297             /* COULOMB ELECTROSTATICS */
1298             velec            = _mm_mul_pd(qq21,rinv21);
1299             felec            = _mm_mul_pd(velec,rinvsq21);
1300
1301             fscal            = felec;
1302
1303             /* Update vectorial force */
1304             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1305             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1306             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1307             
1308             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1309             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1310             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1311
1312             /**************************
1313              * CALCULATE INTERACTIONS *
1314              **************************/
1315
1316             /* COULOMB ELECTROSTATICS */
1317             velec            = _mm_mul_pd(qq22,rinv22);
1318             felec            = _mm_mul_pd(velec,rinvsq22);
1319
1320             fscal            = felec;
1321
1322             /* Update vectorial force */
1323             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1324             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1325             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1326             
1327             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1328             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1329             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1330
1331             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1332
1333             /* Inner loop uses 297 flops */
1334         }
1335
1336         if(jidx<j_index_end)
1337         {
1338
1339             jnrA             = jjnr[jidx];
1340             j_coord_offsetA  = DIM*jnrA;
1341
1342             /* load j atom coordinates */
1343             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1344                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1345
1346             /* Calculate displacement vector */
1347             dx00             = _mm_sub_pd(ix0,jx0);
1348             dy00             = _mm_sub_pd(iy0,jy0);
1349             dz00             = _mm_sub_pd(iz0,jz0);
1350             dx01             = _mm_sub_pd(ix0,jx1);
1351             dy01             = _mm_sub_pd(iy0,jy1);
1352             dz01             = _mm_sub_pd(iz0,jz1);
1353             dx02             = _mm_sub_pd(ix0,jx2);
1354             dy02             = _mm_sub_pd(iy0,jy2);
1355             dz02             = _mm_sub_pd(iz0,jz2);
1356             dx10             = _mm_sub_pd(ix1,jx0);
1357             dy10             = _mm_sub_pd(iy1,jy0);
1358             dz10             = _mm_sub_pd(iz1,jz0);
1359             dx11             = _mm_sub_pd(ix1,jx1);
1360             dy11             = _mm_sub_pd(iy1,jy1);
1361             dz11             = _mm_sub_pd(iz1,jz1);
1362             dx12             = _mm_sub_pd(ix1,jx2);
1363             dy12             = _mm_sub_pd(iy1,jy2);
1364             dz12             = _mm_sub_pd(iz1,jz2);
1365             dx20             = _mm_sub_pd(ix2,jx0);
1366             dy20             = _mm_sub_pd(iy2,jy0);
1367             dz20             = _mm_sub_pd(iz2,jz0);
1368             dx21             = _mm_sub_pd(ix2,jx1);
1369             dy21             = _mm_sub_pd(iy2,jy1);
1370             dz21             = _mm_sub_pd(iz2,jz1);
1371             dx22             = _mm_sub_pd(ix2,jx2);
1372             dy22             = _mm_sub_pd(iy2,jy2);
1373             dz22             = _mm_sub_pd(iz2,jz2);
1374
1375             /* Calculate squared distance and things based on it */
1376             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1377             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1378             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1379             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1380             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1381             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1382             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1383             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1384             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1385
1386             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1387             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1388             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1389             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1390             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1391             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1392             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1393             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1394             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1395
1396             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1397             rinvsq01         = _mm_mul_pd(rinv01,rinv01);
1398             rinvsq02         = _mm_mul_pd(rinv02,rinv02);
1399             rinvsq10         = _mm_mul_pd(rinv10,rinv10);
1400             rinvsq11         = _mm_mul_pd(rinv11,rinv11);
1401             rinvsq12         = _mm_mul_pd(rinv12,rinv12);
1402             rinvsq20         = _mm_mul_pd(rinv20,rinv20);
1403             rinvsq21         = _mm_mul_pd(rinv21,rinv21);
1404             rinvsq22         = _mm_mul_pd(rinv22,rinv22);
1405
1406             fjx0             = _mm_setzero_pd();
1407             fjy0             = _mm_setzero_pd();
1408             fjz0             = _mm_setzero_pd();
1409             fjx1             = _mm_setzero_pd();
1410             fjy1             = _mm_setzero_pd();
1411             fjz1             = _mm_setzero_pd();
1412             fjx2             = _mm_setzero_pd();
1413             fjy2             = _mm_setzero_pd();
1414             fjz2             = _mm_setzero_pd();
1415
1416             /**************************
1417              * CALCULATE INTERACTIONS *
1418              **************************/
1419
1420             r00              = _mm_mul_pd(rsq00,rinv00);
1421
1422             /* Calculate table index by multiplying r with table scale and truncate to integer */
1423             rt               = _mm_mul_pd(r00,vftabscale);
1424             vfitab           = _mm_cvttpd_epi32(rt);
1425 #ifdef __XOP__
1426             vfeps            = _mm_frcz_pd(rt);
1427 #else
1428             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1429 #endif
1430             twovfeps         = _mm_add_pd(vfeps,vfeps);
1431             vfitab           = _mm_slli_epi32(vfitab,3);
1432
1433             /* COULOMB ELECTROSTATICS */
1434             velec            = _mm_mul_pd(qq00,rinv00);
1435             felec            = _mm_mul_pd(velec,rinvsq00);
1436
1437             /* CUBIC SPLINE TABLE DISPERSION */
1438             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1439             F                = _mm_setzero_pd();
1440             GMX_MM_TRANSPOSE2_PD(Y,F);
1441             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1442             H                = _mm_setzero_pd();
1443             GMX_MM_TRANSPOSE2_PD(G,H);
1444             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1445             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1446             fvdw6            = _mm_mul_pd(c6_00,FF);
1447
1448             /* CUBIC SPLINE TABLE REPULSION */
1449             vfitab           = _mm_add_epi32(vfitab,ifour);
1450             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1451             F                = _mm_setzero_pd();
1452             GMX_MM_TRANSPOSE2_PD(Y,F);
1453             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1454             H                = _mm_setzero_pd();
1455             GMX_MM_TRANSPOSE2_PD(G,H);
1456             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1457             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1458             fvdw12           = _mm_mul_pd(c12_00,FF);
1459             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1460
1461             fscal            = _mm_add_pd(felec,fvdw);
1462
1463             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1464
1465             /* Update vectorial force */
1466             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1467             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1468             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1469             
1470             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1471             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1472             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1473
1474             /**************************
1475              * CALCULATE INTERACTIONS *
1476              **************************/
1477
1478             /* COULOMB ELECTROSTATICS */
1479             velec            = _mm_mul_pd(qq01,rinv01);
1480             felec            = _mm_mul_pd(velec,rinvsq01);
1481
1482             fscal            = felec;
1483
1484             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1485
1486             /* Update vectorial force */
1487             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1488             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1489             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1490             
1491             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1492             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1493             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1494
1495             /**************************
1496              * CALCULATE INTERACTIONS *
1497              **************************/
1498
1499             /* COULOMB ELECTROSTATICS */
1500             velec            = _mm_mul_pd(qq02,rinv02);
1501             felec            = _mm_mul_pd(velec,rinvsq02);
1502
1503             fscal            = felec;
1504
1505             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1506
1507             /* Update vectorial force */
1508             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1509             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1510             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1511             
1512             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1513             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1514             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1515
1516             /**************************
1517              * CALCULATE INTERACTIONS *
1518              **************************/
1519
1520             /* COULOMB ELECTROSTATICS */
1521             velec            = _mm_mul_pd(qq10,rinv10);
1522             felec            = _mm_mul_pd(velec,rinvsq10);
1523
1524             fscal            = felec;
1525
1526             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1527
1528             /* Update vectorial force */
1529             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1530             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1531             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1532             
1533             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1534             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1535             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1536
1537             /**************************
1538              * CALCULATE INTERACTIONS *
1539              **************************/
1540
1541             /* COULOMB ELECTROSTATICS */
1542             velec            = _mm_mul_pd(qq11,rinv11);
1543             felec            = _mm_mul_pd(velec,rinvsq11);
1544
1545             fscal            = felec;
1546
1547             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1548
1549             /* Update vectorial force */
1550             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1551             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1552             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1553             
1554             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1555             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1556             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1557
1558             /**************************
1559              * CALCULATE INTERACTIONS *
1560              **************************/
1561
1562             /* COULOMB ELECTROSTATICS */
1563             velec            = _mm_mul_pd(qq12,rinv12);
1564             felec            = _mm_mul_pd(velec,rinvsq12);
1565
1566             fscal            = felec;
1567
1568             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1569
1570             /* Update vectorial force */
1571             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1572             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1573             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1574             
1575             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1576             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1577             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1578
1579             /**************************
1580              * CALCULATE INTERACTIONS *
1581              **************************/
1582
1583             /* COULOMB ELECTROSTATICS */
1584             velec            = _mm_mul_pd(qq20,rinv20);
1585             felec            = _mm_mul_pd(velec,rinvsq20);
1586
1587             fscal            = felec;
1588
1589             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1590
1591             /* Update vectorial force */
1592             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1593             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1594             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1595             
1596             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1597             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1598             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1599
1600             /**************************
1601              * CALCULATE INTERACTIONS *
1602              **************************/
1603
1604             /* COULOMB ELECTROSTATICS */
1605             velec            = _mm_mul_pd(qq21,rinv21);
1606             felec            = _mm_mul_pd(velec,rinvsq21);
1607
1608             fscal            = felec;
1609
1610             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1611
1612             /* Update vectorial force */
1613             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1614             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1615             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1616             
1617             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1618             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1619             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1620
1621             /**************************
1622              * CALCULATE INTERACTIONS *
1623              **************************/
1624
1625             /* COULOMB ELECTROSTATICS */
1626             velec            = _mm_mul_pd(qq22,rinv22);
1627             felec            = _mm_mul_pd(velec,rinvsq22);
1628
1629             fscal            = felec;
1630
1631             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1632
1633             /* Update vectorial force */
1634             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1635             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1636             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1637             
1638             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1639             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1640             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1641
1642             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1643
1644             /* Inner loop uses 297 flops */
1645         }
1646
1647         /* End of innermost loop */
1648
1649         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1650                                               f+i_coord_offset,fshift+i_shift_offset);
1651
1652         /* Increment number of inner iterations */
1653         inneriter                  += j_index_end - j_index_start;
1654
1655         /* Outer loop uses 18 flops */
1656     }
1657
1658     /* Increment number of outer iterations */
1659     outeriter        += nri;
1660
1661     /* Update outer/inner flops */
1662
1663     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*297);
1664 }