57c38254a9c137aae919c5d95eb3f93da71c48ef
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_avx_128_fma_double.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_double
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B;
73     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B;
75     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B;
77     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
94     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
95     __m128i          vfitab;
96     __m128i          ifour       = _mm_set1_epi32(4);
97     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
98     real             *vftab;
99     __m128d          dummy_mask,cutoff_mask;
100     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101     __m128d          one     = _mm_set1_pd(1.0);
102     __m128d          two     = _mm_set1_pd(2.0);
103     x                = xx[0];
104     f                = ff[0];
105
106     nri              = nlist->nri;
107     iinr             = nlist->iinr;
108     jindex           = nlist->jindex;
109     jjnr             = nlist->jjnr;
110     shiftidx         = nlist->shift;
111     gid              = nlist->gid;
112     shiftvec         = fr->shift_vec[0];
113     fshift           = fr->fshift[0];
114     facel            = _mm_set1_pd(fr->epsfac);
115     charge           = mdatoms->chargeA;
116     nvdwtype         = fr->ntype;
117     vdwparam         = fr->nbfp;
118     vdwtype          = mdatoms->typeA;
119
120     vftab            = kernel_data->table_elec->data;
121     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
122
123     /* Setup water-specific parameters */
124     inr              = nlist->iinr[0];
125     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
126     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
128     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
129
130     jq0              = _mm_set1_pd(charge[inr+0]);
131     jq1              = _mm_set1_pd(charge[inr+1]);
132     jq2              = _mm_set1_pd(charge[inr+2]);
133     vdwjidx0A        = 2*vdwtype[inr+0];
134     qq00             = _mm_mul_pd(iq0,jq0);
135     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
137     qq01             = _mm_mul_pd(iq0,jq1);
138     qq02             = _mm_mul_pd(iq0,jq2);
139     qq10             = _mm_mul_pd(iq1,jq0);
140     qq11             = _mm_mul_pd(iq1,jq1);
141     qq12             = _mm_mul_pd(iq1,jq2);
142     qq20             = _mm_mul_pd(iq2,jq0);
143     qq21             = _mm_mul_pd(iq2,jq1);
144     qq22             = _mm_mul_pd(iq2,jq2);
145
146     /* Avoid stupid compiler warnings */
147     jnrA = jnrB = 0;
148     j_coord_offsetA = 0;
149     j_coord_offsetB = 0;
150
151     outeriter        = 0;
152     inneriter        = 0;
153
154     /* Start outer loop over neighborlists */
155     for(iidx=0; iidx<nri; iidx++)
156     {
157         /* Load shift vector for this list */
158         i_shift_offset   = DIM*shiftidx[iidx];
159
160         /* Load limits for loop over neighbors */
161         j_index_start    = jindex[iidx];
162         j_index_end      = jindex[iidx+1];
163
164         /* Get outer coordinate index */
165         inr              = iinr[iidx];
166         i_coord_offset   = DIM*inr;
167
168         /* Load i particle coords and add shift vector */
169         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
171
172         fix0             = _mm_setzero_pd();
173         fiy0             = _mm_setzero_pd();
174         fiz0             = _mm_setzero_pd();
175         fix1             = _mm_setzero_pd();
176         fiy1             = _mm_setzero_pd();
177         fiz1             = _mm_setzero_pd();
178         fix2             = _mm_setzero_pd();
179         fiy2             = _mm_setzero_pd();
180         fiz2             = _mm_setzero_pd();
181
182         /* Reset potential sums */
183         velecsum         = _mm_setzero_pd();
184         vvdwsum          = _mm_setzero_pd();
185
186         /* Start inner kernel loop */
187         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
188         {
189
190             /* Get j neighbor index, and coordinate index */
191             jnrA             = jjnr[jidx];
192             jnrB             = jjnr[jidx+1];
193             j_coord_offsetA  = DIM*jnrA;
194             j_coord_offsetB  = DIM*jnrB;
195
196             /* load j atom coordinates */
197             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
198                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
199
200             /* Calculate displacement vector */
201             dx00             = _mm_sub_pd(ix0,jx0);
202             dy00             = _mm_sub_pd(iy0,jy0);
203             dz00             = _mm_sub_pd(iz0,jz0);
204             dx01             = _mm_sub_pd(ix0,jx1);
205             dy01             = _mm_sub_pd(iy0,jy1);
206             dz01             = _mm_sub_pd(iz0,jz1);
207             dx02             = _mm_sub_pd(ix0,jx2);
208             dy02             = _mm_sub_pd(iy0,jy2);
209             dz02             = _mm_sub_pd(iz0,jz2);
210             dx10             = _mm_sub_pd(ix1,jx0);
211             dy10             = _mm_sub_pd(iy1,jy0);
212             dz10             = _mm_sub_pd(iz1,jz0);
213             dx11             = _mm_sub_pd(ix1,jx1);
214             dy11             = _mm_sub_pd(iy1,jy1);
215             dz11             = _mm_sub_pd(iz1,jz1);
216             dx12             = _mm_sub_pd(ix1,jx2);
217             dy12             = _mm_sub_pd(iy1,jy2);
218             dz12             = _mm_sub_pd(iz1,jz2);
219             dx20             = _mm_sub_pd(ix2,jx0);
220             dy20             = _mm_sub_pd(iy2,jy0);
221             dz20             = _mm_sub_pd(iz2,jz0);
222             dx21             = _mm_sub_pd(ix2,jx1);
223             dy21             = _mm_sub_pd(iy2,jy1);
224             dz21             = _mm_sub_pd(iz2,jz1);
225             dx22             = _mm_sub_pd(ix2,jx2);
226             dy22             = _mm_sub_pd(iy2,jy2);
227             dz22             = _mm_sub_pd(iz2,jz2);
228
229             /* Calculate squared distance and things based on it */
230             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
232             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
233             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
234             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
235             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
236             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
237             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
238             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
239
240             rinv00           = gmx_mm_invsqrt_pd(rsq00);
241             rinv01           = gmx_mm_invsqrt_pd(rsq01);
242             rinv02           = gmx_mm_invsqrt_pd(rsq02);
243             rinv10           = gmx_mm_invsqrt_pd(rsq10);
244             rinv11           = gmx_mm_invsqrt_pd(rsq11);
245             rinv12           = gmx_mm_invsqrt_pd(rsq12);
246             rinv20           = gmx_mm_invsqrt_pd(rsq20);
247             rinv21           = gmx_mm_invsqrt_pd(rsq21);
248             rinv22           = gmx_mm_invsqrt_pd(rsq22);
249
250             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
251
252             fjx0             = _mm_setzero_pd();
253             fjy0             = _mm_setzero_pd();
254             fjz0             = _mm_setzero_pd();
255             fjx1             = _mm_setzero_pd();
256             fjy1             = _mm_setzero_pd();
257             fjz1             = _mm_setzero_pd();
258             fjx2             = _mm_setzero_pd();
259             fjy2             = _mm_setzero_pd();
260             fjz2             = _mm_setzero_pd();
261
262             /**************************
263              * CALCULATE INTERACTIONS *
264              **************************/
265
266             r00              = _mm_mul_pd(rsq00,rinv00);
267
268             /* Calculate table index by multiplying r with table scale and truncate to integer */
269             rt               = _mm_mul_pd(r00,vftabscale);
270             vfitab           = _mm_cvttpd_epi32(rt);
271 #ifdef __XOP__
272             vfeps            = _mm_frcz_pd(rt);
273 #else
274             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
275 #endif
276             twovfeps         = _mm_add_pd(vfeps,vfeps);
277             vfitab           = _mm_slli_epi32(vfitab,2);
278
279             /* CUBIC SPLINE TABLE ELECTROSTATICS */
280             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
281             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
282             GMX_MM_TRANSPOSE2_PD(Y,F);
283             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
284             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
285             GMX_MM_TRANSPOSE2_PD(G,H);
286             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
287             VV               = _mm_macc_pd(vfeps,Fp,Y);
288             velec            = _mm_mul_pd(qq00,VV);
289             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
290             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
291
292             /* LENNARD-JONES DISPERSION/REPULSION */
293
294             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
295             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
296             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
297             vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
298             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
299
300             /* Update potential sum for this i atom from the interaction with this j atom. */
301             velecsum         = _mm_add_pd(velecsum,velec);
302             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
303
304             fscal            = _mm_add_pd(felec,fvdw);
305
306             /* Update vectorial force */
307             fix0             = _mm_macc_pd(dx00,fscal,fix0);
308             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
309             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
310             
311             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
312             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
313             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
314
315             /**************************
316              * CALCULATE INTERACTIONS *
317              **************************/
318
319             r01              = _mm_mul_pd(rsq01,rinv01);
320
321             /* Calculate table index by multiplying r with table scale and truncate to integer */
322             rt               = _mm_mul_pd(r01,vftabscale);
323             vfitab           = _mm_cvttpd_epi32(rt);
324 #ifdef __XOP__
325             vfeps            = _mm_frcz_pd(rt);
326 #else
327             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
328 #endif
329             twovfeps         = _mm_add_pd(vfeps,vfeps);
330             vfitab           = _mm_slli_epi32(vfitab,2);
331
332             /* CUBIC SPLINE TABLE ELECTROSTATICS */
333             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
334             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
335             GMX_MM_TRANSPOSE2_PD(Y,F);
336             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
337             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
338             GMX_MM_TRANSPOSE2_PD(G,H);
339             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
340             VV               = _mm_macc_pd(vfeps,Fp,Y);
341             velec            = _mm_mul_pd(qq01,VV);
342             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
343             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
344
345             /* Update potential sum for this i atom from the interaction with this j atom. */
346             velecsum         = _mm_add_pd(velecsum,velec);
347
348             fscal            = felec;
349
350             /* Update vectorial force */
351             fix0             = _mm_macc_pd(dx01,fscal,fix0);
352             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
353             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
354             
355             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
356             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
357             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
358
359             /**************************
360              * CALCULATE INTERACTIONS *
361              **************************/
362
363             r02              = _mm_mul_pd(rsq02,rinv02);
364
365             /* Calculate table index by multiplying r with table scale and truncate to integer */
366             rt               = _mm_mul_pd(r02,vftabscale);
367             vfitab           = _mm_cvttpd_epi32(rt);
368 #ifdef __XOP__
369             vfeps            = _mm_frcz_pd(rt);
370 #else
371             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
372 #endif
373             twovfeps         = _mm_add_pd(vfeps,vfeps);
374             vfitab           = _mm_slli_epi32(vfitab,2);
375
376             /* CUBIC SPLINE TABLE ELECTROSTATICS */
377             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
378             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
379             GMX_MM_TRANSPOSE2_PD(Y,F);
380             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
381             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
382             GMX_MM_TRANSPOSE2_PD(G,H);
383             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
384             VV               = _mm_macc_pd(vfeps,Fp,Y);
385             velec            = _mm_mul_pd(qq02,VV);
386             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
387             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velecsum         = _mm_add_pd(velecsum,velec);
391
392             fscal            = felec;
393
394             /* Update vectorial force */
395             fix0             = _mm_macc_pd(dx02,fscal,fix0);
396             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
397             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
398             
399             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
400             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
401             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
402
403             /**************************
404              * CALCULATE INTERACTIONS *
405              **************************/
406
407             r10              = _mm_mul_pd(rsq10,rinv10);
408
409             /* Calculate table index by multiplying r with table scale and truncate to integer */
410             rt               = _mm_mul_pd(r10,vftabscale);
411             vfitab           = _mm_cvttpd_epi32(rt);
412 #ifdef __XOP__
413             vfeps            = _mm_frcz_pd(rt);
414 #else
415             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
416 #endif
417             twovfeps         = _mm_add_pd(vfeps,vfeps);
418             vfitab           = _mm_slli_epi32(vfitab,2);
419
420             /* CUBIC SPLINE TABLE ELECTROSTATICS */
421             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
422             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
423             GMX_MM_TRANSPOSE2_PD(Y,F);
424             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
425             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
426             GMX_MM_TRANSPOSE2_PD(G,H);
427             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
428             VV               = _mm_macc_pd(vfeps,Fp,Y);
429             velec            = _mm_mul_pd(qq10,VV);
430             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
431             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
432
433             /* Update potential sum for this i atom from the interaction with this j atom. */
434             velecsum         = _mm_add_pd(velecsum,velec);
435
436             fscal            = felec;
437
438             /* Update vectorial force */
439             fix1             = _mm_macc_pd(dx10,fscal,fix1);
440             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
441             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
442             
443             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
444             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
445             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
446
447             /**************************
448              * CALCULATE INTERACTIONS *
449              **************************/
450
451             r11              = _mm_mul_pd(rsq11,rinv11);
452
453             /* Calculate table index by multiplying r with table scale and truncate to integer */
454             rt               = _mm_mul_pd(r11,vftabscale);
455             vfitab           = _mm_cvttpd_epi32(rt);
456 #ifdef __XOP__
457             vfeps            = _mm_frcz_pd(rt);
458 #else
459             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
460 #endif
461             twovfeps         = _mm_add_pd(vfeps,vfeps);
462             vfitab           = _mm_slli_epi32(vfitab,2);
463
464             /* CUBIC SPLINE TABLE ELECTROSTATICS */
465             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
466             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
467             GMX_MM_TRANSPOSE2_PD(Y,F);
468             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
469             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
470             GMX_MM_TRANSPOSE2_PD(G,H);
471             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
472             VV               = _mm_macc_pd(vfeps,Fp,Y);
473             velec            = _mm_mul_pd(qq11,VV);
474             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
475             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
476
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velecsum         = _mm_add_pd(velecsum,velec);
479
480             fscal            = felec;
481
482             /* Update vectorial force */
483             fix1             = _mm_macc_pd(dx11,fscal,fix1);
484             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
485             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
486             
487             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
488             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
489             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
490
491             /**************************
492              * CALCULATE INTERACTIONS *
493              **************************/
494
495             r12              = _mm_mul_pd(rsq12,rinv12);
496
497             /* Calculate table index by multiplying r with table scale and truncate to integer */
498             rt               = _mm_mul_pd(r12,vftabscale);
499             vfitab           = _mm_cvttpd_epi32(rt);
500 #ifdef __XOP__
501             vfeps            = _mm_frcz_pd(rt);
502 #else
503             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
504 #endif
505             twovfeps         = _mm_add_pd(vfeps,vfeps);
506             vfitab           = _mm_slli_epi32(vfitab,2);
507
508             /* CUBIC SPLINE TABLE ELECTROSTATICS */
509             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
510             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
511             GMX_MM_TRANSPOSE2_PD(Y,F);
512             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
513             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
514             GMX_MM_TRANSPOSE2_PD(G,H);
515             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
516             VV               = _mm_macc_pd(vfeps,Fp,Y);
517             velec            = _mm_mul_pd(qq12,VV);
518             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
519             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
520
521             /* Update potential sum for this i atom from the interaction with this j atom. */
522             velecsum         = _mm_add_pd(velecsum,velec);
523
524             fscal            = felec;
525
526             /* Update vectorial force */
527             fix1             = _mm_macc_pd(dx12,fscal,fix1);
528             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
529             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
530             
531             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
532             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
533             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
534
535             /**************************
536              * CALCULATE INTERACTIONS *
537              **************************/
538
539             r20              = _mm_mul_pd(rsq20,rinv20);
540
541             /* Calculate table index by multiplying r with table scale and truncate to integer */
542             rt               = _mm_mul_pd(r20,vftabscale);
543             vfitab           = _mm_cvttpd_epi32(rt);
544 #ifdef __XOP__
545             vfeps            = _mm_frcz_pd(rt);
546 #else
547             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
548 #endif
549             twovfeps         = _mm_add_pd(vfeps,vfeps);
550             vfitab           = _mm_slli_epi32(vfitab,2);
551
552             /* CUBIC SPLINE TABLE ELECTROSTATICS */
553             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
554             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
555             GMX_MM_TRANSPOSE2_PD(Y,F);
556             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
557             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
558             GMX_MM_TRANSPOSE2_PD(G,H);
559             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
560             VV               = _mm_macc_pd(vfeps,Fp,Y);
561             velec            = _mm_mul_pd(qq20,VV);
562             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
563             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
564
565             /* Update potential sum for this i atom from the interaction with this j atom. */
566             velecsum         = _mm_add_pd(velecsum,velec);
567
568             fscal            = felec;
569
570             /* Update vectorial force */
571             fix2             = _mm_macc_pd(dx20,fscal,fix2);
572             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
573             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
574             
575             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
576             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
577             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
578
579             /**************************
580              * CALCULATE INTERACTIONS *
581              **************************/
582
583             r21              = _mm_mul_pd(rsq21,rinv21);
584
585             /* Calculate table index by multiplying r with table scale and truncate to integer */
586             rt               = _mm_mul_pd(r21,vftabscale);
587             vfitab           = _mm_cvttpd_epi32(rt);
588 #ifdef __XOP__
589             vfeps            = _mm_frcz_pd(rt);
590 #else
591             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
592 #endif
593             twovfeps         = _mm_add_pd(vfeps,vfeps);
594             vfitab           = _mm_slli_epi32(vfitab,2);
595
596             /* CUBIC SPLINE TABLE ELECTROSTATICS */
597             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
598             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
599             GMX_MM_TRANSPOSE2_PD(Y,F);
600             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
601             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
602             GMX_MM_TRANSPOSE2_PD(G,H);
603             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
604             VV               = _mm_macc_pd(vfeps,Fp,Y);
605             velec            = _mm_mul_pd(qq21,VV);
606             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
607             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
608
609             /* Update potential sum for this i atom from the interaction with this j atom. */
610             velecsum         = _mm_add_pd(velecsum,velec);
611
612             fscal            = felec;
613
614             /* Update vectorial force */
615             fix2             = _mm_macc_pd(dx21,fscal,fix2);
616             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
617             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
618             
619             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
620             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
621             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
622
623             /**************************
624              * CALCULATE INTERACTIONS *
625              **************************/
626
627             r22              = _mm_mul_pd(rsq22,rinv22);
628
629             /* Calculate table index by multiplying r with table scale and truncate to integer */
630             rt               = _mm_mul_pd(r22,vftabscale);
631             vfitab           = _mm_cvttpd_epi32(rt);
632 #ifdef __XOP__
633             vfeps            = _mm_frcz_pd(rt);
634 #else
635             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
636 #endif
637             twovfeps         = _mm_add_pd(vfeps,vfeps);
638             vfitab           = _mm_slli_epi32(vfitab,2);
639
640             /* CUBIC SPLINE TABLE ELECTROSTATICS */
641             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
642             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
643             GMX_MM_TRANSPOSE2_PD(Y,F);
644             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
645             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
646             GMX_MM_TRANSPOSE2_PD(G,H);
647             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
648             VV               = _mm_macc_pd(vfeps,Fp,Y);
649             velec            = _mm_mul_pd(qq22,VV);
650             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
651             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
652
653             /* Update potential sum for this i atom from the interaction with this j atom. */
654             velecsum         = _mm_add_pd(velecsum,velec);
655
656             fscal            = felec;
657
658             /* Update vectorial force */
659             fix2             = _mm_macc_pd(dx22,fscal,fix2);
660             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
661             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
662             
663             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
664             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
665             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
666
667             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
668
669             /* Inner loop uses 427 flops */
670         }
671
672         if(jidx<j_index_end)
673         {
674
675             jnrA             = jjnr[jidx];
676             j_coord_offsetA  = DIM*jnrA;
677
678             /* load j atom coordinates */
679             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
680                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
681
682             /* Calculate displacement vector */
683             dx00             = _mm_sub_pd(ix0,jx0);
684             dy00             = _mm_sub_pd(iy0,jy0);
685             dz00             = _mm_sub_pd(iz0,jz0);
686             dx01             = _mm_sub_pd(ix0,jx1);
687             dy01             = _mm_sub_pd(iy0,jy1);
688             dz01             = _mm_sub_pd(iz0,jz1);
689             dx02             = _mm_sub_pd(ix0,jx2);
690             dy02             = _mm_sub_pd(iy0,jy2);
691             dz02             = _mm_sub_pd(iz0,jz2);
692             dx10             = _mm_sub_pd(ix1,jx0);
693             dy10             = _mm_sub_pd(iy1,jy0);
694             dz10             = _mm_sub_pd(iz1,jz0);
695             dx11             = _mm_sub_pd(ix1,jx1);
696             dy11             = _mm_sub_pd(iy1,jy1);
697             dz11             = _mm_sub_pd(iz1,jz1);
698             dx12             = _mm_sub_pd(ix1,jx2);
699             dy12             = _mm_sub_pd(iy1,jy2);
700             dz12             = _mm_sub_pd(iz1,jz2);
701             dx20             = _mm_sub_pd(ix2,jx0);
702             dy20             = _mm_sub_pd(iy2,jy0);
703             dz20             = _mm_sub_pd(iz2,jz0);
704             dx21             = _mm_sub_pd(ix2,jx1);
705             dy21             = _mm_sub_pd(iy2,jy1);
706             dz21             = _mm_sub_pd(iz2,jz1);
707             dx22             = _mm_sub_pd(ix2,jx2);
708             dy22             = _mm_sub_pd(iy2,jy2);
709             dz22             = _mm_sub_pd(iz2,jz2);
710
711             /* Calculate squared distance and things based on it */
712             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
713             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
714             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
715             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
716             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
717             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
718             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
719             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
720             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
721
722             rinv00           = gmx_mm_invsqrt_pd(rsq00);
723             rinv01           = gmx_mm_invsqrt_pd(rsq01);
724             rinv02           = gmx_mm_invsqrt_pd(rsq02);
725             rinv10           = gmx_mm_invsqrt_pd(rsq10);
726             rinv11           = gmx_mm_invsqrt_pd(rsq11);
727             rinv12           = gmx_mm_invsqrt_pd(rsq12);
728             rinv20           = gmx_mm_invsqrt_pd(rsq20);
729             rinv21           = gmx_mm_invsqrt_pd(rsq21);
730             rinv22           = gmx_mm_invsqrt_pd(rsq22);
731
732             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
733
734             fjx0             = _mm_setzero_pd();
735             fjy0             = _mm_setzero_pd();
736             fjz0             = _mm_setzero_pd();
737             fjx1             = _mm_setzero_pd();
738             fjy1             = _mm_setzero_pd();
739             fjz1             = _mm_setzero_pd();
740             fjx2             = _mm_setzero_pd();
741             fjy2             = _mm_setzero_pd();
742             fjz2             = _mm_setzero_pd();
743
744             /**************************
745              * CALCULATE INTERACTIONS *
746              **************************/
747
748             r00              = _mm_mul_pd(rsq00,rinv00);
749
750             /* Calculate table index by multiplying r with table scale and truncate to integer */
751             rt               = _mm_mul_pd(r00,vftabscale);
752             vfitab           = _mm_cvttpd_epi32(rt);
753 #ifdef __XOP__
754             vfeps            = _mm_frcz_pd(rt);
755 #else
756             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
757 #endif
758             twovfeps         = _mm_add_pd(vfeps,vfeps);
759             vfitab           = _mm_slli_epi32(vfitab,2);
760
761             /* CUBIC SPLINE TABLE ELECTROSTATICS */
762             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
763             F                = _mm_setzero_pd();
764             GMX_MM_TRANSPOSE2_PD(Y,F);
765             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
766             H                = _mm_setzero_pd();
767             GMX_MM_TRANSPOSE2_PD(G,H);
768             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
769             VV               = _mm_macc_pd(vfeps,Fp,Y);
770             velec            = _mm_mul_pd(qq00,VV);
771             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
772             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
773
774             /* LENNARD-JONES DISPERSION/REPULSION */
775
776             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
777             vvdw6            = _mm_mul_pd(c6_00,rinvsix);
778             vvdw12           = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
779             vvdw             = _mm_msub_pd( vvdw12,one_twelfth, _mm_mul_pd(vvdw6,one_sixth) );
780             fvdw             = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
781
782             /* Update potential sum for this i atom from the interaction with this j atom. */
783             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
784             velecsum         = _mm_add_pd(velecsum,velec);
785             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
786             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
787
788             fscal            = _mm_add_pd(felec,fvdw);
789
790             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
791
792             /* Update vectorial force */
793             fix0             = _mm_macc_pd(dx00,fscal,fix0);
794             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
795             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
796             
797             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
798             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
799             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
800
801             /**************************
802              * CALCULATE INTERACTIONS *
803              **************************/
804
805             r01              = _mm_mul_pd(rsq01,rinv01);
806
807             /* Calculate table index by multiplying r with table scale and truncate to integer */
808             rt               = _mm_mul_pd(r01,vftabscale);
809             vfitab           = _mm_cvttpd_epi32(rt);
810 #ifdef __XOP__
811             vfeps            = _mm_frcz_pd(rt);
812 #else
813             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
814 #endif
815             twovfeps         = _mm_add_pd(vfeps,vfeps);
816             vfitab           = _mm_slli_epi32(vfitab,2);
817
818             /* CUBIC SPLINE TABLE ELECTROSTATICS */
819             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
820             F                = _mm_setzero_pd();
821             GMX_MM_TRANSPOSE2_PD(Y,F);
822             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
823             H                = _mm_setzero_pd();
824             GMX_MM_TRANSPOSE2_PD(G,H);
825             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
826             VV               = _mm_macc_pd(vfeps,Fp,Y);
827             velec            = _mm_mul_pd(qq01,VV);
828             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
829             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
830
831             /* Update potential sum for this i atom from the interaction with this j atom. */
832             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
833             velecsum         = _mm_add_pd(velecsum,velec);
834
835             fscal            = felec;
836
837             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
838
839             /* Update vectorial force */
840             fix0             = _mm_macc_pd(dx01,fscal,fix0);
841             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
842             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
843             
844             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
845             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
846             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
847
848             /**************************
849              * CALCULATE INTERACTIONS *
850              **************************/
851
852             r02              = _mm_mul_pd(rsq02,rinv02);
853
854             /* Calculate table index by multiplying r with table scale and truncate to integer */
855             rt               = _mm_mul_pd(r02,vftabscale);
856             vfitab           = _mm_cvttpd_epi32(rt);
857 #ifdef __XOP__
858             vfeps            = _mm_frcz_pd(rt);
859 #else
860             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
861 #endif
862             twovfeps         = _mm_add_pd(vfeps,vfeps);
863             vfitab           = _mm_slli_epi32(vfitab,2);
864
865             /* CUBIC SPLINE TABLE ELECTROSTATICS */
866             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
867             F                = _mm_setzero_pd();
868             GMX_MM_TRANSPOSE2_PD(Y,F);
869             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
870             H                = _mm_setzero_pd();
871             GMX_MM_TRANSPOSE2_PD(G,H);
872             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
873             VV               = _mm_macc_pd(vfeps,Fp,Y);
874             velec            = _mm_mul_pd(qq02,VV);
875             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
876             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
877
878             /* Update potential sum for this i atom from the interaction with this j atom. */
879             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
880             velecsum         = _mm_add_pd(velecsum,velec);
881
882             fscal            = felec;
883
884             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
885
886             /* Update vectorial force */
887             fix0             = _mm_macc_pd(dx02,fscal,fix0);
888             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
889             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
890             
891             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
892             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
893             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
894
895             /**************************
896              * CALCULATE INTERACTIONS *
897              **************************/
898
899             r10              = _mm_mul_pd(rsq10,rinv10);
900
901             /* Calculate table index by multiplying r with table scale and truncate to integer */
902             rt               = _mm_mul_pd(r10,vftabscale);
903             vfitab           = _mm_cvttpd_epi32(rt);
904 #ifdef __XOP__
905             vfeps            = _mm_frcz_pd(rt);
906 #else
907             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
908 #endif
909             twovfeps         = _mm_add_pd(vfeps,vfeps);
910             vfitab           = _mm_slli_epi32(vfitab,2);
911
912             /* CUBIC SPLINE TABLE ELECTROSTATICS */
913             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
914             F                = _mm_setzero_pd();
915             GMX_MM_TRANSPOSE2_PD(Y,F);
916             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
917             H                = _mm_setzero_pd();
918             GMX_MM_TRANSPOSE2_PD(G,H);
919             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
920             VV               = _mm_macc_pd(vfeps,Fp,Y);
921             velec            = _mm_mul_pd(qq10,VV);
922             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
923             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
924
925             /* Update potential sum for this i atom from the interaction with this j atom. */
926             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
927             velecsum         = _mm_add_pd(velecsum,velec);
928
929             fscal            = felec;
930
931             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
932
933             /* Update vectorial force */
934             fix1             = _mm_macc_pd(dx10,fscal,fix1);
935             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
936             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
937             
938             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
939             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
940             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
941
942             /**************************
943              * CALCULATE INTERACTIONS *
944              **************************/
945
946             r11              = _mm_mul_pd(rsq11,rinv11);
947
948             /* Calculate table index by multiplying r with table scale and truncate to integer */
949             rt               = _mm_mul_pd(r11,vftabscale);
950             vfitab           = _mm_cvttpd_epi32(rt);
951 #ifdef __XOP__
952             vfeps            = _mm_frcz_pd(rt);
953 #else
954             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
955 #endif
956             twovfeps         = _mm_add_pd(vfeps,vfeps);
957             vfitab           = _mm_slli_epi32(vfitab,2);
958
959             /* CUBIC SPLINE TABLE ELECTROSTATICS */
960             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
961             F                = _mm_setzero_pd();
962             GMX_MM_TRANSPOSE2_PD(Y,F);
963             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
964             H                = _mm_setzero_pd();
965             GMX_MM_TRANSPOSE2_PD(G,H);
966             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
967             VV               = _mm_macc_pd(vfeps,Fp,Y);
968             velec            = _mm_mul_pd(qq11,VV);
969             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
970             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
971
972             /* Update potential sum for this i atom from the interaction with this j atom. */
973             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
974             velecsum         = _mm_add_pd(velecsum,velec);
975
976             fscal            = felec;
977
978             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
979
980             /* Update vectorial force */
981             fix1             = _mm_macc_pd(dx11,fscal,fix1);
982             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
983             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
984             
985             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
986             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
987             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
988
989             /**************************
990              * CALCULATE INTERACTIONS *
991              **************************/
992
993             r12              = _mm_mul_pd(rsq12,rinv12);
994
995             /* Calculate table index by multiplying r with table scale and truncate to integer */
996             rt               = _mm_mul_pd(r12,vftabscale);
997             vfitab           = _mm_cvttpd_epi32(rt);
998 #ifdef __XOP__
999             vfeps            = _mm_frcz_pd(rt);
1000 #else
1001             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1002 #endif
1003             twovfeps         = _mm_add_pd(vfeps,vfeps);
1004             vfitab           = _mm_slli_epi32(vfitab,2);
1005
1006             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1007             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1008             F                = _mm_setzero_pd();
1009             GMX_MM_TRANSPOSE2_PD(Y,F);
1010             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1011             H                = _mm_setzero_pd();
1012             GMX_MM_TRANSPOSE2_PD(G,H);
1013             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1014             VV               = _mm_macc_pd(vfeps,Fp,Y);
1015             velec            = _mm_mul_pd(qq12,VV);
1016             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1017             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1018
1019             /* Update potential sum for this i atom from the interaction with this j atom. */
1020             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1021             velecsum         = _mm_add_pd(velecsum,velec);
1022
1023             fscal            = felec;
1024
1025             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1026
1027             /* Update vectorial force */
1028             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1029             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1030             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1031             
1032             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1033             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1034             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1035
1036             /**************************
1037              * CALCULATE INTERACTIONS *
1038              **************************/
1039
1040             r20              = _mm_mul_pd(rsq20,rinv20);
1041
1042             /* Calculate table index by multiplying r with table scale and truncate to integer */
1043             rt               = _mm_mul_pd(r20,vftabscale);
1044             vfitab           = _mm_cvttpd_epi32(rt);
1045 #ifdef __XOP__
1046             vfeps            = _mm_frcz_pd(rt);
1047 #else
1048             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1049 #endif
1050             twovfeps         = _mm_add_pd(vfeps,vfeps);
1051             vfitab           = _mm_slli_epi32(vfitab,2);
1052
1053             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1054             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1055             F                = _mm_setzero_pd();
1056             GMX_MM_TRANSPOSE2_PD(Y,F);
1057             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1058             H                = _mm_setzero_pd();
1059             GMX_MM_TRANSPOSE2_PD(G,H);
1060             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1061             VV               = _mm_macc_pd(vfeps,Fp,Y);
1062             velec            = _mm_mul_pd(qq20,VV);
1063             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1064             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1065
1066             /* Update potential sum for this i atom from the interaction with this j atom. */
1067             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1068             velecsum         = _mm_add_pd(velecsum,velec);
1069
1070             fscal            = felec;
1071
1072             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1073
1074             /* Update vectorial force */
1075             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1076             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1077             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1078             
1079             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1080             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1081             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1082
1083             /**************************
1084              * CALCULATE INTERACTIONS *
1085              **************************/
1086
1087             r21              = _mm_mul_pd(rsq21,rinv21);
1088
1089             /* Calculate table index by multiplying r with table scale and truncate to integer */
1090             rt               = _mm_mul_pd(r21,vftabscale);
1091             vfitab           = _mm_cvttpd_epi32(rt);
1092 #ifdef __XOP__
1093             vfeps            = _mm_frcz_pd(rt);
1094 #else
1095             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1096 #endif
1097             twovfeps         = _mm_add_pd(vfeps,vfeps);
1098             vfitab           = _mm_slli_epi32(vfitab,2);
1099
1100             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1101             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1102             F                = _mm_setzero_pd();
1103             GMX_MM_TRANSPOSE2_PD(Y,F);
1104             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1105             H                = _mm_setzero_pd();
1106             GMX_MM_TRANSPOSE2_PD(G,H);
1107             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1108             VV               = _mm_macc_pd(vfeps,Fp,Y);
1109             velec            = _mm_mul_pd(qq21,VV);
1110             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1111             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1112
1113             /* Update potential sum for this i atom from the interaction with this j atom. */
1114             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1115             velecsum         = _mm_add_pd(velecsum,velec);
1116
1117             fscal            = felec;
1118
1119             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1120
1121             /* Update vectorial force */
1122             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1123             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1124             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1125             
1126             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1127             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1128             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1129
1130             /**************************
1131              * CALCULATE INTERACTIONS *
1132              **************************/
1133
1134             r22              = _mm_mul_pd(rsq22,rinv22);
1135
1136             /* Calculate table index by multiplying r with table scale and truncate to integer */
1137             rt               = _mm_mul_pd(r22,vftabscale);
1138             vfitab           = _mm_cvttpd_epi32(rt);
1139 #ifdef __XOP__
1140             vfeps            = _mm_frcz_pd(rt);
1141 #else
1142             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1143 #endif
1144             twovfeps         = _mm_add_pd(vfeps,vfeps);
1145             vfitab           = _mm_slli_epi32(vfitab,2);
1146
1147             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1149             F                = _mm_setzero_pd();
1150             GMX_MM_TRANSPOSE2_PD(Y,F);
1151             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1152             H                = _mm_setzero_pd();
1153             GMX_MM_TRANSPOSE2_PD(G,H);
1154             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1155             VV               = _mm_macc_pd(vfeps,Fp,Y);
1156             velec            = _mm_mul_pd(qq22,VV);
1157             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1158             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1159
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1162             velecsum         = _mm_add_pd(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1167
1168             /* Update vectorial force */
1169             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1170             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1171             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1172             
1173             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1174             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1175             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1176
1177             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1178
1179             /* Inner loop uses 427 flops */
1180         }
1181
1182         /* End of innermost loop */
1183
1184         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1185                                               f+i_coord_offset,fshift+i_shift_offset);
1186
1187         ggid                        = gid[iidx];
1188         /* Update potential energies */
1189         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1190         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1191
1192         /* Increment number of inner iterations */
1193         inneriter                  += j_index_end - j_index_start;
1194
1195         /* Outer loop uses 20 flops */
1196     }
1197
1198     /* Increment number of outer iterations */
1199     outeriter        += nri;
1200
1201     /* Update outer/inner flops */
1202
1203     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*427);
1204 }
1205 /*
1206  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_double
1207  * Electrostatics interaction: CubicSplineTable
1208  * VdW interaction:            LennardJones
1209  * Geometry:                   Water3-Water3
1210  * Calculate force/pot:        Force
1211  */
1212 void
1213 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_double
1214                     (t_nblist * gmx_restrict                nlist,
1215                      rvec * gmx_restrict                    xx,
1216                      rvec * gmx_restrict                    ff,
1217                      t_forcerec * gmx_restrict              fr,
1218                      t_mdatoms * gmx_restrict               mdatoms,
1219                      nb_kernel_data_t * gmx_restrict        kernel_data,
1220                      t_nrnb * gmx_restrict                  nrnb)
1221 {
1222     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1223      * just 0 for non-waters.
1224      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1225      * jnr indices corresponding to data put in the four positions in the SIMD register.
1226      */
1227     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1228     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1229     int              jnrA,jnrB;
1230     int              j_coord_offsetA,j_coord_offsetB;
1231     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1232     real             rcutoff_scalar;
1233     real             *shiftvec,*fshift,*x,*f;
1234     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1235     int              vdwioffset0;
1236     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1237     int              vdwioffset1;
1238     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1239     int              vdwioffset2;
1240     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1241     int              vdwjidx0A,vdwjidx0B;
1242     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1243     int              vdwjidx1A,vdwjidx1B;
1244     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1245     int              vdwjidx2A,vdwjidx2B;
1246     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1247     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1248     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1249     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1250     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1251     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1252     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1253     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1254     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1255     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1256     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1257     real             *charge;
1258     int              nvdwtype;
1259     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1260     int              *vdwtype;
1261     real             *vdwparam;
1262     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1263     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1264     __m128i          vfitab;
1265     __m128i          ifour       = _mm_set1_epi32(4);
1266     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1267     real             *vftab;
1268     __m128d          dummy_mask,cutoff_mask;
1269     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1270     __m128d          one     = _mm_set1_pd(1.0);
1271     __m128d          two     = _mm_set1_pd(2.0);
1272     x                = xx[0];
1273     f                = ff[0];
1274
1275     nri              = nlist->nri;
1276     iinr             = nlist->iinr;
1277     jindex           = nlist->jindex;
1278     jjnr             = nlist->jjnr;
1279     shiftidx         = nlist->shift;
1280     gid              = nlist->gid;
1281     shiftvec         = fr->shift_vec[0];
1282     fshift           = fr->fshift[0];
1283     facel            = _mm_set1_pd(fr->epsfac);
1284     charge           = mdatoms->chargeA;
1285     nvdwtype         = fr->ntype;
1286     vdwparam         = fr->nbfp;
1287     vdwtype          = mdatoms->typeA;
1288
1289     vftab            = kernel_data->table_elec->data;
1290     vftabscale       = _mm_set1_pd(kernel_data->table_elec->scale);
1291
1292     /* Setup water-specific parameters */
1293     inr              = nlist->iinr[0];
1294     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1295     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1296     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1297     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1298
1299     jq0              = _mm_set1_pd(charge[inr+0]);
1300     jq1              = _mm_set1_pd(charge[inr+1]);
1301     jq2              = _mm_set1_pd(charge[inr+2]);
1302     vdwjidx0A        = 2*vdwtype[inr+0];
1303     qq00             = _mm_mul_pd(iq0,jq0);
1304     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1305     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1306     qq01             = _mm_mul_pd(iq0,jq1);
1307     qq02             = _mm_mul_pd(iq0,jq2);
1308     qq10             = _mm_mul_pd(iq1,jq0);
1309     qq11             = _mm_mul_pd(iq1,jq1);
1310     qq12             = _mm_mul_pd(iq1,jq2);
1311     qq20             = _mm_mul_pd(iq2,jq0);
1312     qq21             = _mm_mul_pd(iq2,jq1);
1313     qq22             = _mm_mul_pd(iq2,jq2);
1314
1315     /* Avoid stupid compiler warnings */
1316     jnrA = jnrB = 0;
1317     j_coord_offsetA = 0;
1318     j_coord_offsetB = 0;
1319
1320     outeriter        = 0;
1321     inneriter        = 0;
1322
1323     /* Start outer loop over neighborlists */
1324     for(iidx=0; iidx<nri; iidx++)
1325     {
1326         /* Load shift vector for this list */
1327         i_shift_offset   = DIM*shiftidx[iidx];
1328
1329         /* Load limits for loop over neighbors */
1330         j_index_start    = jindex[iidx];
1331         j_index_end      = jindex[iidx+1];
1332
1333         /* Get outer coordinate index */
1334         inr              = iinr[iidx];
1335         i_coord_offset   = DIM*inr;
1336
1337         /* Load i particle coords and add shift vector */
1338         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1339                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1340
1341         fix0             = _mm_setzero_pd();
1342         fiy0             = _mm_setzero_pd();
1343         fiz0             = _mm_setzero_pd();
1344         fix1             = _mm_setzero_pd();
1345         fiy1             = _mm_setzero_pd();
1346         fiz1             = _mm_setzero_pd();
1347         fix2             = _mm_setzero_pd();
1348         fiy2             = _mm_setzero_pd();
1349         fiz2             = _mm_setzero_pd();
1350
1351         /* Start inner kernel loop */
1352         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1353         {
1354
1355             /* Get j neighbor index, and coordinate index */
1356             jnrA             = jjnr[jidx];
1357             jnrB             = jjnr[jidx+1];
1358             j_coord_offsetA  = DIM*jnrA;
1359             j_coord_offsetB  = DIM*jnrB;
1360
1361             /* load j atom coordinates */
1362             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1363                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1364
1365             /* Calculate displacement vector */
1366             dx00             = _mm_sub_pd(ix0,jx0);
1367             dy00             = _mm_sub_pd(iy0,jy0);
1368             dz00             = _mm_sub_pd(iz0,jz0);
1369             dx01             = _mm_sub_pd(ix0,jx1);
1370             dy01             = _mm_sub_pd(iy0,jy1);
1371             dz01             = _mm_sub_pd(iz0,jz1);
1372             dx02             = _mm_sub_pd(ix0,jx2);
1373             dy02             = _mm_sub_pd(iy0,jy2);
1374             dz02             = _mm_sub_pd(iz0,jz2);
1375             dx10             = _mm_sub_pd(ix1,jx0);
1376             dy10             = _mm_sub_pd(iy1,jy0);
1377             dz10             = _mm_sub_pd(iz1,jz0);
1378             dx11             = _mm_sub_pd(ix1,jx1);
1379             dy11             = _mm_sub_pd(iy1,jy1);
1380             dz11             = _mm_sub_pd(iz1,jz1);
1381             dx12             = _mm_sub_pd(ix1,jx2);
1382             dy12             = _mm_sub_pd(iy1,jy2);
1383             dz12             = _mm_sub_pd(iz1,jz2);
1384             dx20             = _mm_sub_pd(ix2,jx0);
1385             dy20             = _mm_sub_pd(iy2,jy0);
1386             dz20             = _mm_sub_pd(iz2,jz0);
1387             dx21             = _mm_sub_pd(ix2,jx1);
1388             dy21             = _mm_sub_pd(iy2,jy1);
1389             dz21             = _mm_sub_pd(iz2,jz1);
1390             dx22             = _mm_sub_pd(ix2,jx2);
1391             dy22             = _mm_sub_pd(iy2,jy2);
1392             dz22             = _mm_sub_pd(iz2,jz2);
1393
1394             /* Calculate squared distance and things based on it */
1395             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1396             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1397             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1398             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1399             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1400             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1401             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1402             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1403             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1404
1405             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1406             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1407             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1408             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1409             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1410             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1411             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1412             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1413             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1414
1415             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1416
1417             fjx0             = _mm_setzero_pd();
1418             fjy0             = _mm_setzero_pd();
1419             fjz0             = _mm_setzero_pd();
1420             fjx1             = _mm_setzero_pd();
1421             fjy1             = _mm_setzero_pd();
1422             fjz1             = _mm_setzero_pd();
1423             fjx2             = _mm_setzero_pd();
1424             fjy2             = _mm_setzero_pd();
1425             fjz2             = _mm_setzero_pd();
1426
1427             /**************************
1428              * CALCULATE INTERACTIONS *
1429              **************************/
1430
1431             r00              = _mm_mul_pd(rsq00,rinv00);
1432
1433             /* Calculate table index by multiplying r with table scale and truncate to integer */
1434             rt               = _mm_mul_pd(r00,vftabscale);
1435             vfitab           = _mm_cvttpd_epi32(rt);
1436 #ifdef __XOP__
1437             vfeps            = _mm_frcz_pd(rt);
1438 #else
1439             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1440 #endif
1441             twovfeps         = _mm_add_pd(vfeps,vfeps);
1442             vfitab           = _mm_slli_epi32(vfitab,2);
1443
1444             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1445             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1446             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1447             GMX_MM_TRANSPOSE2_PD(Y,F);
1448             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1449             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1450             GMX_MM_TRANSPOSE2_PD(G,H);
1451             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1452             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1453             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1454
1455             /* LENNARD-JONES DISPERSION/REPULSION */
1456
1457             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1458             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1459
1460             fscal            = _mm_add_pd(felec,fvdw);
1461
1462             /* Update vectorial force */
1463             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1464             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1465             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1466             
1467             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1468             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1469             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1470
1471             /**************************
1472              * CALCULATE INTERACTIONS *
1473              **************************/
1474
1475             r01              = _mm_mul_pd(rsq01,rinv01);
1476
1477             /* Calculate table index by multiplying r with table scale and truncate to integer */
1478             rt               = _mm_mul_pd(r01,vftabscale);
1479             vfitab           = _mm_cvttpd_epi32(rt);
1480 #ifdef __XOP__
1481             vfeps            = _mm_frcz_pd(rt);
1482 #else
1483             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1484 #endif
1485             twovfeps         = _mm_add_pd(vfeps,vfeps);
1486             vfitab           = _mm_slli_epi32(vfitab,2);
1487
1488             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1489             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1490             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1491             GMX_MM_TRANSPOSE2_PD(Y,F);
1492             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1493             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1494             GMX_MM_TRANSPOSE2_PD(G,H);
1495             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1496             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1497             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1498
1499             fscal            = felec;
1500
1501             /* Update vectorial force */
1502             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1503             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1504             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1505             
1506             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1507             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1508             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1509
1510             /**************************
1511              * CALCULATE INTERACTIONS *
1512              **************************/
1513
1514             r02              = _mm_mul_pd(rsq02,rinv02);
1515
1516             /* Calculate table index by multiplying r with table scale and truncate to integer */
1517             rt               = _mm_mul_pd(r02,vftabscale);
1518             vfitab           = _mm_cvttpd_epi32(rt);
1519 #ifdef __XOP__
1520             vfeps            = _mm_frcz_pd(rt);
1521 #else
1522             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1523 #endif
1524             twovfeps         = _mm_add_pd(vfeps,vfeps);
1525             vfitab           = _mm_slli_epi32(vfitab,2);
1526
1527             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1528             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1529             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1530             GMX_MM_TRANSPOSE2_PD(Y,F);
1531             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1532             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1533             GMX_MM_TRANSPOSE2_PD(G,H);
1534             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1535             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1536             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1537
1538             fscal            = felec;
1539
1540             /* Update vectorial force */
1541             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1542             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1543             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1544             
1545             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1546             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1547             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1548
1549             /**************************
1550              * CALCULATE INTERACTIONS *
1551              **************************/
1552
1553             r10              = _mm_mul_pd(rsq10,rinv10);
1554
1555             /* Calculate table index by multiplying r with table scale and truncate to integer */
1556             rt               = _mm_mul_pd(r10,vftabscale);
1557             vfitab           = _mm_cvttpd_epi32(rt);
1558 #ifdef __XOP__
1559             vfeps            = _mm_frcz_pd(rt);
1560 #else
1561             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1562 #endif
1563             twovfeps         = _mm_add_pd(vfeps,vfeps);
1564             vfitab           = _mm_slli_epi32(vfitab,2);
1565
1566             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1567             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1568             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1569             GMX_MM_TRANSPOSE2_PD(Y,F);
1570             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1571             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1572             GMX_MM_TRANSPOSE2_PD(G,H);
1573             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1574             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1575             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1576
1577             fscal            = felec;
1578
1579             /* Update vectorial force */
1580             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1581             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1582             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1583             
1584             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1585             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1586             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1587
1588             /**************************
1589              * CALCULATE INTERACTIONS *
1590              **************************/
1591
1592             r11              = _mm_mul_pd(rsq11,rinv11);
1593
1594             /* Calculate table index by multiplying r with table scale and truncate to integer */
1595             rt               = _mm_mul_pd(r11,vftabscale);
1596             vfitab           = _mm_cvttpd_epi32(rt);
1597 #ifdef __XOP__
1598             vfeps            = _mm_frcz_pd(rt);
1599 #else
1600             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1601 #endif
1602             twovfeps         = _mm_add_pd(vfeps,vfeps);
1603             vfitab           = _mm_slli_epi32(vfitab,2);
1604
1605             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1606             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1607             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1608             GMX_MM_TRANSPOSE2_PD(Y,F);
1609             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1610             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1611             GMX_MM_TRANSPOSE2_PD(G,H);
1612             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1613             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1614             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1615
1616             fscal            = felec;
1617
1618             /* Update vectorial force */
1619             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1620             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1621             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1622             
1623             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1624             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1625             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1626
1627             /**************************
1628              * CALCULATE INTERACTIONS *
1629              **************************/
1630
1631             r12              = _mm_mul_pd(rsq12,rinv12);
1632
1633             /* Calculate table index by multiplying r with table scale and truncate to integer */
1634             rt               = _mm_mul_pd(r12,vftabscale);
1635             vfitab           = _mm_cvttpd_epi32(rt);
1636 #ifdef __XOP__
1637             vfeps            = _mm_frcz_pd(rt);
1638 #else
1639             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1640 #endif
1641             twovfeps         = _mm_add_pd(vfeps,vfeps);
1642             vfitab           = _mm_slli_epi32(vfitab,2);
1643
1644             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1645             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1646             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1647             GMX_MM_TRANSPOSE2_PD(Y,F);
1648             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1649             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1650             GMX_MM_TRANSPOSE2_PD(G,H);
1651             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1652             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1653             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1654
1655             fscal            = felec;
1656
1657             /* Update vectorial force */
1658             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1659             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1660             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1661             
1662             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1663             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1664             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1665
1666             /**************************
1667              * CALCULATE INTERACTIONS *
1668              **************************/
1669
1670             r20              = _mm_mul_pd(rsq20,rinv20);
1671
1672             /* Calculate table index by multiplying r with table scale and truncate to integer */
1673             rt               = _mm_mul_pd(r20,vftabscale);
1674             vfitab           = _mm_cvttpd_epi32(rt);
1675 #ifdef __XOP__
1676             vfeps            = _mm_frcz_pd(rt);
1677 #else
1678             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1679 #endif
1680             twovfeps         = _mm_add_pd(vfeps,vfeps);
1681             vfitab           = _mm_slli_epi32(vfitab,2);
1682
1683             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1684             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1685             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1686             GMX_MM_TRANSPOSE2_PD(Y,F);
1687             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1688             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1689             GMX_MM_TRANSPOSE2_PD(G,H);
1690             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1691             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1692             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1693
1694             fscal            = felec;
1695
1696             /* Update vectorial force */
1697             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1698             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1699             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1700             
1701             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1702             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1703             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1704
1705             /**************************
1706              * CALCULATE INTERACTIONS *
1707              **************************/
1708
1709             r21              = _mm_mul_pd(rsq21,rinv21);
1710
1711             /* Calculate table index by multiplying r with table scale and truncate to integer */
1712             rt               = _mm_mul_pd(r21,vftabscale);
1713             vfitab           = _mm_cvttpd_epi32(rt);
1714 #ifdef __XOP__
1715             vfeps            = _mm_frcz_pd(rt);
1716 #else
1717             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1718 #endif
1719             twovfeps         = _mm_add_pd(vfeps,vfeps);
1720             vfitab           = _mm_slli_epi32(vfitab,2);
1721
1722             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1723             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1724             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1725             GMX_MM_TRANSPOSE2_PD(Y,F);
1726             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1727             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1728             GMX_MM_TRANSPOSE2_PD(G,H);
1729             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1730             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1731             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1732
1733             fscal            = felec;
1734
1735             /* Update vectorial force */
1736             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1737             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1738             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1739             
1740             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1741             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1742             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1743
1744             /**************************
1745              * CALCULATE INTERACTIONS *
1746              **************************/
1747
1748             r22              = _mm_mul_pd(rsq22,rinv22);
1749
1750             /* Calculate table index by multiplying r with table scale and truncate to integer */
1751             rt               = _mm_mul_pd(r22,vftabscale);
1752             vfitab           = _mm_cvttpd_epi32(rt);
1753 #ifdef __XOP__
1754             vfeps            = _mm_frcz_pd(rt);
1755 #else
1756             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1757 #endif
1758             twovfeps         = _mm_add_pd(vfeps,vfeps);
1759             vfitab           = _mm_slli_epi32(vfitab,2);
1760
1761             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1762             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1763             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1764             GMX_MM_TRANSPOSE2_PD(Y,F);
1765             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1766             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1767             GMX_MM_TRANSPOSE2_PD(G,H);
1768             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1769             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1770             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1771
1772             fscal            = felec;
1773
1774             /* Update vectorial force */
1775             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1776             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1777             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1778             
1779             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1780             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1781             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1782
1783             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1784
1785             /* Inner loop uses 386 flops */
1786         }
1787
1788         if(jidx<j_index_end)
1789         {
1790
1791             jnrA             = jjnr[jidx];
1792             j_coord_offsetA  = DIM*jnrA;
1793
1794             /* load j atom coordinates */
1795             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1796                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1797
1798             /* Calculate displacement vector */
1799             dx00             = _mm_sub_pd(ix0,jx0);
1800             dy00             = _mm_sub_pd(iy0,jy0);
1801             dz00             = _mm_sub_pd(iz0,jz0);
1802             dx01             = _mm_sub_pd(ix0,jx1);
1803             dy01             = _mm_sub_pd(iy0,jy1);
1804             dz01             = _mm_sub_pd(iz0,jz1);
1805             dx02             = _mm_sub_pd(ix0,jx2);
1806             dy02             = _mm_sub_pd(iy0,jy2);
1807             dz02             = _mm_sub_pd(iz0,jz2);
1808             dx10             = _mm_sub_pd(ix1,jx0);
1809             dy10             = _mm_sub_pd(iy1,jy0);
1810             dz10             = _mm_sub_pd(iz1,jz0);
1811             dx11             = _mm_sub_pd(ix1,jx1);
1812             dy11             = _mm_sub_pd(iy1,jy1);
1813             dz11             = _mm_sub_pd(iz1,jz1);
1814             dx12             = _mm_sub_pd(ix1,jx2);
1815             dy12             = _mm_sub_pd(iy1,jy2);
1816             dz12             = _mm_sub_pd(iz1,jz2);
1817             dx20             = _mm_sub_pd(ix2,jx0);
1818             dy20             = _mm_sub_pd(iy2,jy0);
1819             dz20             = _mm_sub_pd(iz2,jz0);
1820             dx21             = _mm_sub_pd(ix2,jx1);
1821             dy21             = _mm_sub_pd(iy2,jy1);
1822             dz21             = _mm_sub_pd(iz2,jz1);
1823             dx22             = _mm_sub_pd(ix2,jx2);
1824             dy22             = _mm_sub_pd(iy2,jy2);
1825             dz22             = _mm_sub_pd(iz2,jz2);
1826
1827             /* Calculate squared distance and things based on it */
1828             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1829             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1830             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1831             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1832             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1833             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1834             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1835             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1836             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1837
1838             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1839             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1840             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1841             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1842             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1843             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1844             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1845             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1846             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1847
1848             rinvsq00         = _mm_mul_pd(rinv00,rinv00);
1849
1850             fjx0             = _mm_setzero_pd();
1851             fjy0             = _mm_setzero_pd();
1852             fjz0             = _mm_setzero_pd();
1853             fjx1             = _mm_setzero_pd();
1854             fjy1             = _mm_setzero_pd();
1855             fjz1             = _mm_setzero_pd();
1856             fjx2             = _mm_setzero_pd();
1857             fjy2             = _mm_setzero_pd();
1858             fjz2             = _mm_setzero_pd();
1859
1860             /**************************
1861              * CALCULATE INTERACTIONS *
1862              **************************/
1863
1864             r00              = _mm_mul_pd(rsq00,rinv00);
1865
1866             /* Calculate table index by multiplying r with table scale and truncate to integer */
1867             rt               = _mm_mul_pd(r00,vftabscale);
1868             vfitab           = _mm_cvttpd_epi32(rt);
1869 #ifdef __XOP__
1870             vfeps            = _mm_frcz_pd(rt);
1871 #else
1872             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1873 #endif
1874             twovfeps         = _mm_add_pd(vfeps,vfeps);
1875             vfitab           = _mm_slli_epi32(vfitab,2);
1876
1877             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1878             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1879             F                = _mm_setzero_pd();
1880             GMX_MM_TRANSPOSE2_PD(Y,F);
1881             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1882             H                = _mm_setzero_pd();
1883             GMX_MM_TRANSPOSE2_PD(G,H);
1884             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1885             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1886             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1887
1888             /* LENNARD-JONES DISPERSION/REPULSION */
1889
1890             rinvsix          = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1891             fvdw             = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1892
1893             fscal            = _mm_add_pd(felec,fvdw);
1894
1895             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1896
1897             /* Update vectorial force */
1898             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1899             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1900             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1901             
1902             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1903             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1904             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1905
1906             /**************************
1907              * CALCULATE INTERACTIONS *
1908              **************************/
1909
1910             r01              = _mm_mul_pd(rsq01,rinv01);
1911
1912             /* Calculate table index by multiplying r with table scale and truncate to integer */
1913             rt               = _mm_mul_pd(r01,vftabscale);
1914             vfitab           = _mm_cvttpd_epi32(rt);
1915 #ifdef __XOP__
1916             vfeps            = _mm_frcz_pd(rt);
1917 #else
1918             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1919 #endif
1920             twovfeps         = _mm_add_pd(vfeps,vfeps);
1921             vfitab           = _mm_slli_epi32(vfitab,2);
1922
1923             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1924             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1925             F                = _mm_setzero_pd();
1926             GMX_MM_TRANSPOSE2_PD(Y,F);
1927             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1928             H                = _mm_setzero_pd();
1929             GMX_MM_TRANSPOSE2_PD(G,H);
1930             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1931             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1932             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1933
1934             fscal            = felec;
1935
1936             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1937
1938             /* Update vectorial force */
1939             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1940             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1941             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1942             
1943             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1944             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1945             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1946
1947             /**************************
1948              * CALCULATE INTERACTIONS *
1949              **************************/
1950
1951             r02              = _mm_mul_pd(rsq02,rinv02);
1952
1953             /* Calculate table index by multiplying r with table scale and truncate to integer */
1954             rt               = _mm_mul_pd(r02,vftabscale);
1955             vfitab           = _mm_cvttpd_epi32(rt);
1956 #ifdef __XOP__
1957             vfeps            = _mm_frcz_pd(rt);
1958 #else
1959             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1960 #endif
1961             twovfeps         = _mm_add_pd(vfeps,vfeps);
1962             vfitab           = _mm_slli_epi32(vfitab,2);
1963
1964             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1965             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1966             F                = _mm_setzero_pd();
1967             GMX_MM_TRANSPOSE2_PD(Y,F);
1968             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1969             H                = _mm_setzero_pd();
1970             GMX_MM_TRANSPOSE2_PD(G,H);
1971             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1972             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1973             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1974
1975             fscal            = felec;
1976
1977             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1978
1979             /* Update vectorial force */
1980             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1981             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1982             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1983             
1984             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1985             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1986             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1987
1988             /**************************
1989              * CALCULATE INTERACTIONS *
1990              **************************/
1991
1992             r10              = _mm_mul_pd(rsq10,rinv10);
1993
1994             /* Calculate table index by multiplying r with table scale and truncate to integer */
1995             rt               = _mm_mul_pd(r10,vftabscale);
1996             vfitab           = _mm_cvttpd_epi32(rt);
1997 #ifdef __XOP__
1998             vfeps            = _mm_frcz_pd(rt);
1999 #else
2000             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2001 #endif
2002             twovfeps         = _mm_add_pd(vfeps,vfeps);
2003             vfitab           = _mm_slli_epi32(vfitab,2);
2004
2005             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2006             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2007             F                = _mm_setzero_pd();
2008             GMX_MM_TRANSPOSE2_PD(Y,F);
2009             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2010             H                = _mm_setzero_pd();
2011             GMX_MM_TRANSPOSE2_PD(G,H);
2012             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2013             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2014             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2015
2016             fscal            = felec;
2017
2018             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2019
2020             /* Update vectorial force */
2021             fix1             = _mm_macc_pd(dx10,fscal,fix1);
2022             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
2023             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
2024             
2025             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
2026             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
2027             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
2028
2029             /**************************
2030              * CALCULATE INTERACTIONS *
2031              **************************/
2032
2033             r11              = _mm_mul_pd(rsq11,rinv11);
2034
2035             /* Calculate table index by multiplying r with table scale and truncate to integer */
2036             rt               = _mm_mul_pd(r11,vftabscale);
2037             vfitab           = _mm_cvttpd_epi32(rt);
2038 #ifdef __XOP__
2039             vfeps            = _mm_frcz_pd(rt);
2040 #else
2041             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2042 #endif
2043             twovfeps         = _mm_add_pd(vfeps,vfeps);
2044             vfitab           = _mm_slli_epi32(vfitab,2);
2045
2046             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2047             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2048             F                = _mm_setzero_pd();
2049             GMX_MM_TRANSPOSE2_PD(Y,F);
2050             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2051             H                = _mm_setzero_pd();
2052             GMX_MM_TRANSPOSE2_PD(G,H);
2053             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2054             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2055             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2056
2057             fscal            = felec;
2058
2059             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2060
2061             /* Update vectorial force */
2062             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2063             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2064             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2065             
2066             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2067             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2068             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2069
2070             /**************************
2071              * CALCULATE INTERACTIONS *
2072              **************************/
2073
2074             r12              = _mm_mul_pd(rsq12,rinv12);
2075
2076             /* Calculate table index by multiplying r with table scale and truncate to integer */
2077             rt               = _mm_mul_pd(r12,vftabscale);
2078             vfitab           = _mm_cvttpd_epi32(rt);
2079 #ifdef __XOP__
2080             vfeps            = _mm_frcz_pd(rt);
2081 #else
2082             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2083 #endif
2084             twovfeps         = _mm_add_pd(vfeps,vfeps);
2085             vfitab           = _mm_slli_epi32(vfitab,2);
2086
2087             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2088             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2089             F                = _mm_setzero_pd();
2090             GMX_MM_TRANSPOSE2_PD(Y,F);
2091             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2092             H                = _mm_setzero_pd();
2093             GMX_MM_TRANSPOSE2_PD(G,H);
2094             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2095             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2096             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2097
2098             fscal            = felec;
2099
2100             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2101
2102             /* Update vectorial force */
2103             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2104             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2105             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2106             
2107             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2108             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2109             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2110
2111             /**************************
2112              * CALCULATE INTERACTIONS *
2113              **************************/
2114
2115             r20              = _mm_mul_pd(rsq20,rinv20);
2116
2117             /* Calculate table index by multiplying r with table scale and truncate to integer */
2118             rt               = _mm_mul_pd(r20,vftabscale);
2119             vfitab           = _mm_cvttpd_epi32(rt);
2120 #ifdef __XOP__
2121             vfeps            = _mm_frcz_pd(rt);
2122 #else
2123             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2124 #endif
2125             twovfeps         = _mm_add_pd(vfeps,vfeps);
2126             vfitab           = _mm_slli_epi32(vfitab,2);
2127
2128             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2129             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2130             F                = _mm_setzero_pd();
2131             GMX_MM_TRANSPOSE2_PD(Y,F);
2132             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2133             H                = _mm_setzero_pd();
2134             GMX_MM_TRANSPOSE2_PD(G,H);
2135             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2136             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2137             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2138
2139             fscal            = felec;
2140
2141             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2142
2143             /* Update vectorial force */
2144             fix2             = _mm_macc_pd(dx20,fscal,fix2);
2145             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
2146             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
2147             
2148             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
2149             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
2150             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
2151
2152             /**************************
2153              * CALCULATE INTERACTIONS *
2154              **************************/
2155
2156             r21              = _mm_mul_pd(rsq21,rinv21);
2157
2158             /* Calculate table index by multiplying r with table scale and truncate to integer */
2159             rt               = _mm_mul_pd(r21,vftabscale);
2160             vfitab           = _mm_cvttpd_epi32(rt);
2161 #ifdef __XOP__
2162             vfeps            = _mm_frcz_pd(rt);
2163 #else
2164             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2165 #endif
2166             twovfeps         = _mm_add_pd(vfeps,vfeps);
2167             vfitab           = _mm_slli_epi32(vfitab,2);
2168
2169             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2170             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2171             F                = _mm_setzero_pd();
2172             GMX_MM_TRANSPOSE2_PD(Y,F);
2173             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2174             H                = _mm_setzero_pd();
2175             GMX_MM_TRANSPOSE2_PD(G,H);
2176             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2177             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2178             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2179
2180             fscal            = felec;
2181
2182             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2183
2184             /* Update vectorial force */
2185             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2186             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2187             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2188             
2189             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2190             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2191             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2192
2193             /**************************
2194              * CALCULATE INTERACTIONS *
2195              **************************/
2196
2197             r22              = _mm_mul_pd(rsq22,rinv22);
2198
2199             /* Calculate table index by multiplying r with table scale and truncate to integer */
2200             rt               = _mm_mul_pd(r22,vftabscale);
2201             vfitab           = _mm_cvttpd_epi32(rt);
2202 #ifdef __XOP__
2203             vfeps            = _mm_frcz_pd(rt);
2204 #else
2205             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2206 #endif
2207             twovfeps         = _mm_add_pd(vfeps,vfeps);
2208             vfitab           = _mm_slli_epi32(vfitab,2);
2209
2210             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2211             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2212             F                = _mm_setzero_pd();
2213             GMX_MM_TRANSPOSE2_PD(Y,F);
2214             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2215             H                = _mm_setzero_pd();
2216             GMX_MM_TRANSPOSE2_PD(G,H);
2217             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2218             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2219             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2220
2221             fscal            = felec;
2222
2223             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2224
2225             /* Update vectorial force */
2226             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2227             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2228             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2229             
2230             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2231             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2232             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2233
2234             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2235
2236             /* Inner loop uses 386 flops */
2237         }
2238
2239         /* End of innermost loop */
2240
2241         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2242                                               f+i_coord_offset,fshift+i_shift_offset);
2243
2244         /* Increment number of inner iterations */
2245         inneriter                  += j_index_end - j_index_start;
2246
2247         /* Outer loop uses 18 flops */
2248     }
2249
2250     /* Increment number of outer iterations */
2251     outeriter        += nri;
2252
2253     /* Update outer/inner flops */
2254
2255     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*386);
2256 }