4d2c8920fa74836d7d0b05467ae90a45132a0b05
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_double / nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_double.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_double kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_double.h"
34 #include "kernelutil_x86_sse2_double.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_double
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB;
61     int              j_coord_offsetA,j_coord_offsetB;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwjidx0A,vdwjidx0B;
73     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
74     int              vdwjidx1A,vdwjidx1B;
75     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
76     int              vdwjidx2A,vdwjidx2B;
77     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
78     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
80     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
81     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
83     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
84     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
86     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
87     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
88     real             *charge;
89     int              nvdwtype;
90     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
91     int              *vdwtype;
92     real             *vdwparam;
93     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
94     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
95     __m128i          vfitab;
96     __m128i          ifour       = _mm_set1_epi32(4);
97     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
98     real             *vftab;
99     __m128d          dummy_mask,cutoff_mask;
100     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
101     __m128d          one     = _mm_set1_pd(1.0);
102     __m128d          two     = _mm_set1_pd(2.0);
103     x                = xx[0];
104     f                = ff[0];
105
106     nri              = nlist->nri;
107     iinr             = nlist->iinr;
108     jindex           = nlist->jindex;
109     jjnr             = nlist->jjnr;
110     shiftidx         = nlist->shift;
111     gid              = nlist->gid;
112     shiftvec         = fr->shift_vec[0];
113     fshift           = fr->fshift[0];
114     facel            = _mm_set1_pd(fr->epsfac);
115     charge           = mdatoms->chargeA;
116     nvdwtype         = fr->ntype;
117     vdwparam         = fr->nbfp;
118     vdwtype          = mdatoms->typeA;
119
120     vftab            = kernel_data->table_elec_vdw->data;
121     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
122
123     /* Setup water-specific parameters */
124     inr              = nlist->iinr[0];
125     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
126     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
127     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
128     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
129
130     jq0              = _mm_set1_pd(charge[inr+0]);
131     jq1              = _mm_set1_pd(charge[inr+1]);
132     jq2              = _mm_set1_pd(charge[inr+2]);
133     vdwjidx0A        = 2*vdwtype[inr+0];
134     qq00             = _mm_mul_pd(iq0,jq0);
135     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
136     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
137     qq01             = _mm_mul_pd(iq0,jq1);
138     qq02             = _mm_mul_pd(iq0,jq2);
139     qq10             = _mm_mul_pd(iq1,jq0);
140     qq11             = _mm_mul_pd(iq1,jq1);
141     qq12             = _mm_mul_pd(iq1,jq2);
142     qq20             = _mm_mul_pd(iq2,jq0);
143     qq21             = _mm_mul_pd(iq2,jq1);
144     qq22             = _mm_mul_pd(iq2,jq2);
145
146     /* Avoid stupid compiler warnings */
147     jnrA = jnrB = 0;
148     j_coord_offsetA = 0;
149     j_coord_offsetB = 0;
150
151     outeriter        = 0;
152     inneriter        = 0;
153
154     /* Start outer loop over neighborlists */
155     for(iidx=0; iidx<nri; iidx++)
156     {
157         /* Load shift vector for this list */
158         i_shift_offset   = DIM*shiftidx[iidx];
159
160         /* Load limits for loop over neighbors */
161         j_index_start    = jindex[iidx];
162         j_index_end      = jindex[iidx+1];
163
164         /* Get outer coordinate index */
165         inr              = iinr[iidx];
166         i_coord_offset   = DIM*inr;
167
168         /* Load i particle coords and add shift vector */
169         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
170                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
171
172         fix0             = _mm_setzero_pd();
173         fiy0             = _mm_setzero_pd();
174         fiz0             = _mm_setzero_pd();
175         fix1             = _mm_setzero_pd();
176         fiy1             = _mm_setzero_pd();
177         fiz1             = _mm_setzero_pd();
178         fix2             = _mm_setzero_pd();
179         fiy2             = _mm_setzero_pd();
180         fiz2             = _mm_setzero_pd();
181
182         /* Reset potential sums */
183         velecsum         = _mm_setzero_pd();
184         vvdwsum          = _mm_setzero_pd();
185
186         /* Start inner kernel loop */
187         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
188         {
189
190             /* Get j neighbor index, and coordinate index */
191             jnrA             = jjnr[jidx];
192             jnrB             = jjnr[jidx+1];
193             j_coord_offsetA  = DIM*jnrA;
194             j_coord_offsetB  = DIM*jnrB;
195             
196             /* load j atom coordinates */
197             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
198                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
199             
200             /* Calculate displacement vector */
201             dx00             = _mm_sub_pd(ix0,jx0);
202             dy00             = _mm_sub_pd(iy0,jy0);
203             dz00             = _mm_sub_pd(iz0,jz0);
204             dx01             = _mm_sub_pd(ix0,jx1);
205             dy01             = _mm_sub_pd(iy0,jy1);
206             dz01             = _mm_sub_pd(iz0,jz1);
207             dx02             = _mm_sub_pd(ix0,jx2);
208             dy02             = _mm_sub_pd(iy0,jy2);
209             dz02             = _mm_sub_pd(iz0,jz2);
210             dx10             = _mm_sub_pd(ix1,jx0);
211             dy10             = _mm_sub_pd(iy1,jy0);
212             dz10             = _mm_sub_pd(iz1,jz0);
213             dx11             = _mm_sub_pd(ix1,jx1);
214             dy11             = _mm_sub_pd(iy1,jy1);
215             dz11             = _mm_sub_pd(iz1,jz1);
216             dx12             = _mm_sub_pd(ix1,jx2);
217             dy12             = _mm_sub_pd(iy1,jy2);
218             dz12             = _mm_sub_pd(iz1,jz2);
219             dx20             = _mm_sub_pd(ix2,jx0);
220             dy20             = _mm_sub_pd(iy2,jy0);
221             dz20             = _mm_sub_pd(iz2,jz0);
222             dx21             = _mm_sub_pd(ix2,jx1);
223             dy21             = _mm_sub_pd(iy2,jy1);
224             dz21             = _mm_sub_pd(iz2,jz1);
225             dx22             = _mm_sub_pd(ix2,jx2);
226             dy22             = _mm_sub_pd(iy2,jy2);
227             dz22             = _mm_sub_pd(iz2,jz2);
228
229             /* Calculate squared distance and things based on it */
230             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
231             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
232             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
233             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
234             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
235             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
236             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
237             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
238             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
239
240             rinv00           = gmx_mm_invsqrt_pd(rsq00);
241             rinv01           = gmx_mm_invsqrt_pd(rsq01);
242             rinv02           = gmx_mm_invsqrt_pd(rsq02);
243             rinv10           = gmx_mm_invsqrt_pd(rsq10);
244             rinv11           = gmx_mm_invsqrt_pd(rsq11);
245             rinv12           = gmx_mm_invsqrt_pd(rsq12);
246             rinv20           = gmx_mm_invsqrt_pd(rsq20);
247             rinv21           = gmx_mm_invsqrt_pd(rsq21);
248             rinv22           = gmx_mm_invsqrt_pd(rsq22);
249
250             fjx0             = _mm_setzero_pd();
251             fjy0             = _mm_setzero_pd();
252             fjz0             = _mm_setzero_pd();
253             fjx1             = _mm_setzero_pd();
254             fjy1             = _mm_setzero_pd();
255             fjz1             = _mm_setzero_pd();
256             fjx2             = _mm_setzero_pd();
257             fjy2             = _mm_setzero_pd();
258             fjz2             = _mm_setzero_pd();
259
260             /**************************
261              * CALCULATE INTERACTIONS *
262              **************************/
263
264             r00              = _mm_mul_pd(rsq00,rinv00);
265
266             /* Calculate table index by multiplying r with table scale and truncate to integer */
267             rt               = _mm_mul_pd(r00,vftabscale);
268             vfitab           = _mm_cvttpd_epi32(rt);
269             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
270             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
271
272             /* CUBIC SPLINE TABLE ELECTROSTATICS */
273             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
274             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
275             GMX_MM_TRANSPOSE2_PD(Y,F);
276             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
277             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
278             GMX_MM_TRANSPOSE2_PD(G,H);
279             Heps             = _mm_mul_pd(vfeps,H);
280             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
281             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
282             velec            = _mm_mul_pd(qq00,VV);
283             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
284             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
285
286             /* CUBIC SPLINE TABLE DISPERSION */
287             vfitab           = _mm_add_epi32(vfitab,ifour);
288             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
289             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
290             GMX_MM_TRANSPOSE2_PD(Y,F);
291             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
292             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
293             GMX_MM_TRANSPOSE2_PD(G,H);
294             Heps             = _mm_mul_pd(vfeps,H);
295             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
296             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
297             vvdw6            = _mm_mul_pd(c6_00,VV);
298             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
299             fvdw6            = _mm_mul_pd(c6_00,FF);
300
301             /* CUBIC SPLINE TABLE REPULSION */
302             vfitab           = _mm_add_epi32(vfitab,ifour);
303             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
304             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
305             GMX_MM_TRANSPOSE2_PD(Y,F);
306             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
307             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
308             GMX_MM_TRANSPOSE2_PD(G,H);
309             Heps             = _mm_mul_pd(vfeps,H);
310             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
311             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
312             vvdw12           = _mm_mul_pd(c12_00,VV);
313             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
314             fvdw12           = _mm_mul_pd(c12_00,FF);
315             vvdw             = _mm_add_pd(vvdw12,vvdw6);
316             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
317
318             /* Update potential sum for this i atom from the interaction with this j atom. */
319             velecsum         = _mm_add_pd(velecsum,velec);
320             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
321
322             fscal            = _mm_add_pd(felec,fvdw);
323
324             /* Calculate temporary vectorial force */
325             tx               = _mm_mul_pd(fscal,dx00);
326             ty               = _mm_mul_pd(fscal,dy00);
327             tz               = _mm_mul_pd(fscal,dz00);
328
329             /* Update vectorial force */
330             fix0             = _mm_add_pd(fix0,tx);
331             fiy0             = _mm_add_pd(fiy0,ty);
332             fiz0             = _mm_add_pd(fiz0,tz);
333
334             fjx0             = _mm_add_pd(fjx0,tx);
335             fjy0             = _mm_add_pd(fjy0,ty);
336             fjz0             = _mm_add_pd(fjz0,tz);
337
338             /**************************
339              * CALCULATE INTERACTIONS *
340              **************************/
341
342             r01              = _mm_mul_pd(rsq01,rinv01);
343
344             /* Calculate table index by multiplying r with table scale and truncate to integer */
345             rt               = _mm_mul_pd(r01,vftabscale);
346             vfitab           = _mm_cvttpd_epi32(rt);
347             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
348             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
349
350             /* CUBIC SPLINE TABLE ELECTROSTATICS */
351             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
352             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
353             GMX_MM_TRANSPOSE2_PD(Y,F);
354             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
355             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
356             GMX_MM_TRANSPOSE2_PD(G,H);
357             Heps             = _mm_mul_pd(vfeps,H);
358             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
359             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
360             velec            = _mm_mul_pd(qq01,VV);
361             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
362             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
363
364             /* Update potential sum for this i atom from the interaction with this j atom. */
365             velecsum         = _mm_add_pd(velecsum,velec);
366
367             fscal            = felec;
368
369             /* Calculate temporary vectorial force */
370             tx               = _mm_mul_pd(fscal,dx01);
371             ty               = _mm_mul_pd(fscal,dy01);
372             tz               = _mm_mul_pd(fscal,dz01);
373
374             /* Update vectorial force */
375             fix0             = _mm_add_pd(fix0,tx);
376             fiy0             = _mm_add_pd(fiy0,ty);
377             fiz0             = _mm_add_pd(fiz0,tz);
378
379             fjx1             = _mm_add_pd(fjx1,tx);
380             fjy1             = _mm_add_pd(fjy1,ty);
381             fjz1             = _mm_add_pd(fjz1,tz);
382
383             /**************************
384              * CALCULATE INTERACTIONS *
385              **************************/
386
387             r02              = _mm_mul_pd(rsq02,rinv02);
388
389             /* Calculate table index by multiplying r with table scale and truncate to integer */
390             rt               = _mm_mul_pd(r02,vftabscale);
391             vfitab           = _mm_cvttpd_epi32(rt);
392             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
393             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
394
395             /* CUBIC SPLINE TABLE ELECTROSTATICS */
396             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
397             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
398             GMX_MM_TRANSPOSE2_PD(Y,F);
399             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
400             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
401             GMX_MM_TRANSPOSE2_PD(G,H);
402             Heps             = _mm_mul_pd(vfeps,H);
403             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
404             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
405             velec            = _mm_mul_pd(qq02,VV);
406             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
407             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
408
409             /* Update potential sum for this i atom from the interaction with this j atom. */
410             velecsum         = _mm_add_pd(velecsum,velec);
411
412             fscal            = felec;
413
414             /* Calculate temporary vectorial force */
415             tx               = _mm_mul_pd(fscal,dx02);
416             ty               = _mm_mul_pd(fscal,dy02);
417             tz               = _mm_mul_pd(fscal,dz02);
418
419             /* Update vectorial force */
420             fix0             = _mm_add_pd(fix0,tx);
421             fiy0             = _mm_add_pd(fiy0,ty);
422             fiz0             = _mm_add_pd(fiz0,tz);
423
424             fjx2             = _mm_add_pd(fjx2,tx);
425             fjy2             = _mm_add_pd(fjy2,ty);
426             fjz2             = _mm_add_pd(fjz2,tz);
427
428             /**************************
429              * CALCULATE INTERACTIONS *
430              **************************/
431
432             r10              = _mm_mul_pd(rsq10,rinv10);
433
434             /* Calculate table index by multiplying r with table scale and truncate to integer */
435             rt               = _mm_mul_pd(r10,vftabscale);
436             vfitab           = _mm_cvttpd_epi32(rt);
437             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
438             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
439
440             /* CUBIC SPLINE TABLE ELECTROSTATICS */
441             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
442             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
443             GMX_MM_TRANSPOSE2_PD(Y,F);
444             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
445             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
446             GMX_MM_TRANSPOSE2_PD(G,H);
447             Heps             = _mm_mul_pd(vfeps,H);
448             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
449             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
450             velec            = _mm_mul_pd(qq10,VV);
451             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
452             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
453
454             /* Update potential sum for this i atom from the interaction with this j atom. */
455             velecsum         = _mm_add_pd(velecsum,velec);
456
457             fscal            = felec;
458
459             /* Calculate temporary vectorial force */
460             tx               = _mm_mul_pd(fscal,dx10);
461             ty               = _mm_mul_pd(fscal,dy10);
462             tz               = _mm_mul_pd(fscal,dz10);
463
464             /* Update vectorial force */
465             fix1             = _mm_add_pd(fix1,tx);
466             fiy1             = _mm_add_pd(fiy1,ty);
467             fiz1             = _mm_add_pd(fiz1,tz);
468
469             fjx0             = _mm_add_pd(fjx0,tx);
470             fjy0             = _mm_add_pd(fjy0,ty);
471             fjz0             = _mm_add_pd(fjz0,tz);
472
473             /**************************
474              * CALCULATE INTERACTIONS *
475              **************************/
476
477             r11              = _mm_mul_pd(rsq11,rinv11);
478
479             /* Calculate table index by multiplying r with table scale and truncate to integer */
480             rt               = _mm_mul_pd(r11,vftabscale);
481             vfitab           = _mm_cvttpd_epi32(rt);
482             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
483             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
484
485             /* CUBIC SPLINE TABLE ELECTROSTATICS */
486             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
487             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
488             GMX_MM_TRANSPOSE2_PD(Y,F);
489             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
490             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
491             GMX_MM_TRANSPOSE2_PD(G,H);
492             Heps             = _mm_mul_pd(vfeps,H);
493             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
494             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
495             velec            = _mm_mul_pd(qq11,VV);
496             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
497             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
498
499             /* Update potential sum for this i atom from the interaction with this j atom. */
500             velecsum         = _mm_add_pd(velecsum,velec);
501
502             fscal            = felec;
503
504             /* Calculate temporary vectorial force */
505             tx               = _mm_mul_pd(fscal,dx11);
506             ty               = _mm_mul_pd(fscal,dy11);
507             tz               = _mm_mul_pd(fscal,dz11);
508
509             /* Update vectorial force */
510             fix1             = _mm_add_pd(fix1,tx);
511             fiy1             = _mm_add_pd(fiy1,ty);
512             fiz1             = _mm_add_pd(fiz1,tz);
513
514             fjx1             = _mm_add_pd(fjx1,tx);
515             fjy1             = _mm_add_pd(fjy1,ty);
516             fjz1             = _mm_add_pd(fjz1,tz);
517
518             /**************************
519              * CALCULATE INTERACTIONS *
520              **************************/
521
522             r12              = _mm_mul_pd(rsq12,rinv12);
523
524             /* Calculate table index by multiplying r with table scale and truncate to integer */
525             rt               = _mm_mul_pd(r12,vftabscale);
526             vfitab           = _mm_cvttpd_epi32(rt);
527             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
528             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
529
530             /* CUBIC SPLINE TABLE ELECTROSTATICS */
531             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
532             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
533             GMX_MM_TRANSPOSE2_PD(Y,F);
534             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
535             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
536             GMX_MM_TRANSPOSE2_PD(G,H);
537             Heps             = _mm_mul_pd(vfeps,H);
538             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
539             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
540             velec            = _mm_mul_pd(qq12,VV);
541             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
542             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
543
544             /* Update potential sum for this i atom from the interaction with this j atom. */
545             velecsum         = _mm_add_pd(velecsum,velec);
546
547             fscal            = felec;
548
549             /* Calculate temporary vectorial force */
550             tx               = _mm_mul_pd(fscal,dx12);
551             ty               = _mm_mul_pd(fscal,dy12);
552             tz               = _mm_mul_pd(fscal,dz12);
553
554             /* Update vectorial force */
555             fix1             = _mm_add_pd(fix1,tx);
556             fiy1             = _mm_add_pd(fiy1,ty);
557             fiz1             = _mm_add_pd(fiz1,tz);
558
559             fjx2             = _mm_add_pd(fjx2,tx);
560             fjy2             = _mm_add_pd(fjy2,ty);
561             fjz2             = _mm_add_pd(fjz2,tz);
562
563             /**************************
564              * CALCULATE INTERACTIONS *
565              **************************/
566
567             r20              = _mm_mul_pd(rsq20,rinv20);
568
569             /* Calculate table index by multiplying r with table scale and truncate to integer */
570             rt               = _mm_mul_pd(r20,vftabscale);
571             vfitab           = _mm_cvttpd_epi32(rt);
572             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
573             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
574
575             /* CUBIC SPLINE TABLE ELECTROSTATICS */
576             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
577             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
578             GMX_MM_TRANSPOSE2_PD(Y,F);
579             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
580             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
581             GMX_MM_TRANSPOSE2_PD(G,H);
582             Heps             = _mm_mul_pd(vfeps,H);
583             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
584             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
585             velec            = _mm_mul_pd(qq20,VV);
586             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
587             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
588
589             /* Update potential sum for this i atom from the interaction with this j atom. */
590             velecsum         = _mm_add_pd(velecsum,velec);
591
592             fscal            = felec;
593
594             /* Calculate temporary vectorial force */
595             tx               = _mm_mul_pd(fscal,dx20);
596             ty               = _mm_mul_pd(fscal,dy20);
597             tz               = _mm_mul_pd(fscal,dz20);
598
599             /* Update vectorial force */
600             fix2             = _mm_add_pd(fix2,tx);
601             fiy2             = _mm_add_pd(fiy2,ty);
602             fiz2             = _mm_add_pd(fiz2,tz);
603
604             fjx0             = _mm_add_pd(fjx0,tx);
605             fjy0             = _mm_add_pd(fjy0,ty);
606             fjz0             = _mm_add_pd(fjz0,tz);
607
608             /**************************
609              * CALCULATE INTERACTIONS *
610              **************************/
611
612             r21              = _mm_mul_pd(rsq21,rinv21);
613
614             /* Calculate table index by multiplying r with table scale and truncate to integer */
615             rt               = _mm_mul_pd(r21,vftabscale);
616             vfitab           = _mm_cvttpd_epi32(rt);
617             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
618             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
619
620             /* CUBIC SPLINE TABLE ELECTROSTATICS */
621             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
622             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
623             GMX_MM_TRANSPOSE2_PD(Y,F);
624             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
625             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
626             GMX_MM_TRANSPOSE2_PD(G,H);
627             Heps             = _mm_mul_pd(vfeps,H);
628             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
629             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
630             velec            = _mm_mul_pd(qq21,VV);
631             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
632             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
633
634             /* Update potential sum for this i atom from the interaction with this j atom. */
635             velecsum         = _mm_add_pd(velecsum,velec);
636
637             fscal            = felec;
638
639             /* Calculate temporary vectorial force */
640             tx               = _mm_mul_pd(fscal,dx21);
641             ty               = _mm_mul_pd(fscal,dy21);
642             tz               = _mm_mul_pd(fscal,dz21);
643
644             /* Update vectorial force */
645             fix2             = _mm_add_pd(fix2,tx);
646             fiy2             = _mm_add_pd(fiy2,ty);
647             fiz2             = _mm_add_pd(fiz2,tz);
648
649             fjx1             = _mm_add_pd(fjx1,tx);
650             fjy1             = _mm_add_pd(fjy1,ty);
651             fjz1             = _mm_add_pd(fjz1,tz);
652
653             /**************************
654              * CALCULATE INTERACTIONS *
655              **************************/
656
657             r22              = _mm_mul_pd(rsq22,rinv22);
658
659             /* Calculate table index by multiplying r with table scale and truncate to integer */
660             rt               = _mm_mul_pd(r22,vftabscale);
661             vfitab           = _mm_cvttpd_epi32(rt);
662             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
663             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
664
665             /* CUBIC SPLINE TABLE ELECTROSTATICS */
666             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
667             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
668             GMX_MM_TRANSPOSE2_PD(Y,F);
669             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
670             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
671             GMX_MM_TRANSPOSE2_PD(G,H);
672             Heps             = _mm_mul_pd(vfeps,H);
673             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
674             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
675             velec            = _mm_mul_pd(qq22,VV);
676             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
677             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
678
679             /* Update potential sum for this i atom from the interaction with this j atom. */
680             velecsum         = _mm_add_pd(velecsum,velec);
681
682             fscal            = felec;
683
684             /* Calculate temporary vectorial force */
685             tx               = _mm_mul_pd(fscal,dx22);
686             ty               = _mm_mul_pd(fscal,dy22);
687             tz               = _mm_mul_pd(fscal,dz22);
688
689             /* Update vectorial force */
690             fix2             = _mm_add_pd(fix2,tx);
691             fiy2             = _mm_add_pd(fiy2,ty);
692             fiz2             = _mm_add_pd(fiz2,tz);
693
694             fjx2             = _mm_add_pd(fjx2,tx);
695             fjy2             = _mm_add_pd(fjy2,ty);
696             fjz2             = _mm_add_pd(fjz2,tz);
697
698             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
699
700             /* Inner loop uses 417 flops */
701         }
702
703         if(jidx<j_index_end)
704         {
705
706             jnrA             = jjnr[jidx];
707             j_coord_offsetA  = DIM*jnrA;
708
709             /* load j atom coordinates */
710             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
711                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
712             
713             /* Calculate displacement vector */
714             dx00             = _mm_sub_pd(ix0,jx0);
715             dy00             = _mm_sub_pd(iy0,jy0);
716             dz00             = _mm_sub_pd(iz0,jz0);
717             dx01             = _mm_sub_pd(ix0,jx1);
718             dy01             = _mm_sub_pd(iy0,jy1);
719             dz01             = _mm_sub_pd(iz0,jz1);
720             dx02             = _mm_sub_pd(ix0,jx2);
721             dy02             = _mm_sub_pd(iy0,jy2);
722             dz02             = _mm_sub_pd(iz0,jz2);
723             dx10             = _mm_sub_pd(ix1,jx0);
724             dy10             = _mm_sub_pd(iy1,jy0);
725             dz10             = _mm_sub_pd(iz1,jz0);
726             dx11             = _mm_sub_pd(ix1,jx1);
727             dy11             = _mm_sub_pd(iy1,jy1);
728             dz11             = _mm_sub_pd(iz1,jz1);
729             dx12             = _mm_sub_pd(ix1,jx2);
730             dy12             = _mm_sub_pd(iy1,jy2);
731             dz12             = _mm_sub_pd(iz1,jz2);
732             dx20             = _mm_sub_pd(ix2,jx0);
733             dy20             = _mm_sub_pd(iy2,jy0);
734             dz20             = _mm_sub_pd(iz2,jz0);
735             dx21             = _mm_sub_pd(ix2,jx1);
736             dy21             = _mm_sub_pd(iy2,jy1);
737             dz21             = _mm_sub_pd(iz2,jz1);
738             dx22             = _mm_sub_pd(ix2,jx2);
739             dy22             = _mm_sub_pd(iy2,jy2);
740             dz22             = _mm_sub_pd(iz2,jz2);
741
742             /* Calculate squared distance and things based on it */
743             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
744             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
745             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
746             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
747             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
748             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
749             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
750             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
751             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
752
753             rinv00           = gmx_mm_invsqrt_pd(rsq00);
754             rinv01           = gmx_mm_invsqrt_pd(rsq01);
755             rinv02           = gmx_mm_invsqrt_pd(rsq02);
756             rinv10           = gmx_mm_invsqrt_pd(rsq10);
757             rinv11           = gmx_mm_invsqrt_pd(rsq11);
758             rinv12           = gmx_mm_invsqrt_pd(rsq12);
759             rinv20           = gmx_mm_invsqrt_pd(rsq20);
760             rinv21           = gmx_mm_invsqrt_pd(rsq21);
761             rinv22           = gmx_mm_invsqrt_pd(rsq22);
762
763             fjx0             = _mm_setzero_pd();
764             fjy0             = _mm_setzero_pd();
765             fjz0             = _mm_setzero_pd();
766             fjx1             = _mm_setzero_pd();
767             fjy1             = _mm_setzero_pd();
768             fjz1             = _mm_setzero_pd();
769             fjx2             = _mm_setzero_pd();
770             fjy2             = _mm_setzero_pd();
771             fjz2             = _mm_setzero_pd();
772
773             /**************************
774              * CALCULATE INTERACTIONS *
775              **************************/
776
777             r00              = _mm_mul_pd(rsq00,rinv00);
778
779             /* Calculate table index by multiplying r with table scale and truncate to integer */
780             rt               = _mm_mul_pd(r00,vftabscale);
781             vfitab           = _mm_cvttpd_epi32(rt);
782             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
783             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
784
785             /* CUBIC SPLINE TABLE ELECTROSTATICS */
786             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
787             F                = _mm_setzero_pd();
788             GMX_MM_TRANSPOSE2_PD(Y,F);
789             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
790             H                = _mm_setzero_pd();
791             GMX_MM_TRANSPOSE2_PD(G,H);
792             Heps             = _mm_mul_pd(vfeps,H);
793             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
794             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
795             velec            = _mm_mul_pd(qq00,VV);
796             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
797             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
798
799             /* CUBIC SPLINE TABLE DISPERSION */
800             vfitab           = _mm_add_epi32(vfitab,ifour);
801             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
802             F                = _mm_setzero_pd();
803             GMX_MM_TRANSPOSE2_PD(Y,F);
804             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
805             H                = _mm_setzero_pd();
806             GMX_MM_TRANSPOSE2_PD(G,H);
807             Heps             = _mm_mul_pd(vfeps,H);
808             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
809             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
810             vvdw6            = _mm_mul_pd(c6_00,VV);
811             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
812             fvdw6            = _mm_mul_pd(c6_00,FF);
813
814             /* CUBIC SPLINE TABLE REPULSION */
815             vfitab           = _mm_add_epi32(vfitab,ifour);
816             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
817             F                = _mm_setzero_pd();
818             GMX_MM_TRANSPOSE2_PD(Y,F);
819             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
820             H                = _mm_setzero_pd();
821             GMX_MM_TRANSPOSE2_PD(G,H);
822             Heps             = _mm_mul_pd(vfeps,H);
823             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
824             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
825             vvdw12           = _mm_mul_pd(c12_00,VV);
826             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
827             fvdw12           = _mm_mul_pd(c12_00,FF);
828             vvdw             = _mm_add_pd(vvdw12,vvdw6);
829             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
830
831             /* Update potential sum for this i atom from the interaction with this j atom. */
832             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
833             velecsum         = _mm_add_pd(velecsum,velec);
834             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
835             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
836
837             fscal            = _mm_add_pd(felec,fvdw);
838
839             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
840
841             /* Calculate temporary vectorial force */
842             tx               = _mm_mul_pd(fscal,dx00);
843             ty               = _mm_mul_pd(fscal,dy00);
844             tz               = _mm_mul_pd(fscal,dz00);
845
846             /* Update vectorial force */
847             fix0             = _mm_add_pd(fix0,tx);
848             fiy0             = _mm_add_pd(fiy0,ty);
849             fiz0             = _mm_add_pd(fiz0,tz);
850
851             fjx0             = _mm_add_pd(fjx0,tx);
852             fjy0             = _mm_add_pd(fjy0,ty);
853             fjz0             = _mm_add_pd(fjz0,tz);
854
855             /**************************
856              * CALCULATE INTERACTIONS *
857              **************************/
858
859             r01              = _mm_mul_pd(rsq01,rinv01);
860
861             /* Calculate table index by multiplying r with table scale and truncate to integer */
862             rt               = _mm_mul_pd(r01,vftabscale);
863             vfitab           = _mm_cvttpd_epi32(rt);
864             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
865             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
866
867             /* CUBIC SPLINE TABLE ELECTROSTATICS */
868             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
869             F                = _mm_setzero_pd();
870             GMX_MM_TRANSPOSE2_PD(Y,F);
871             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
872             H                = _mm_setzero_pd();
873             GMX_MM_TRANSPOSE2_PD(G,H);
874             Heps             = _mm_mul_pd(vfeps,H);
875             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
876             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
877             velec            = _mm_mul_pd(qq01,VV);
878             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
879             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
880
881             /* Update potential sum for this i atom from the interaction with this j atom. */
882             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
883             velecsum         = _mm_add_pd(velecsum,velec);
884
885             fscal            = felec;
886
887             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
888
889             /* Calculate temporary vectorial force */
890             tx               = _mm_mul_pd(fscal,dx01);
891             ty               = _mm_mul_pd(fscal,dy01);
892             tz               = _mm_mul_pd(fscal,dz01);
893
894             /* Update vectorial force */
895             fix0             = _mm_add_pd(fix0,tx);
896             fiy0             = _mm_add_pd(fiy0,ty);
897             fiz0             = _mm_add_pd(fiz0,tz);
898
899             fjx1             = _mm_add_pd(fjx1,tx);
900             fjy1             = _mm_add_pd(fjy1,ty);
901             fjz1             = _mm_add_pd(fjz1,tz);
902
903             /**************************
904              * CALCULATE INTERACTIONS *
905              **************************/
906
907             r02              = _mm_mul_pd(rsq02,rinv02);
908
909             /* Calculate table index by multiplying r with table scale and truncate to integer */
910             rt               = _mm_mul_pd(r02,vftabscale);
911             vfitab           = _mm_cvttpd_epi32(rt);
912             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
913             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
914
915             /* CUBIC SPLINE TABLE ELECTROSTATICS */
916             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
917             F                = _mm_setzero_pd();
918             GMX_MM_TRANSPOSE2_PD(Y,F);
919             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
920             H                = _mm_setzero_pd();
921             GMX_MM_TRANSPOSE2_PD(G,H);
922             Heps             = _mm_mul_pd(vfeps,H);
923             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
924             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
925             velec            = _mm_mul_pd(qq02,VV);
926             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
927             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
928
929             /* Update potential sum for this i atom from the interaction with this j atom. */
930             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
931             velecsum         = _mm_add_pd(velecsum,velec);
932
933             fscal            = felec;
934
935             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
936
937             /* Calculate temporary vectorial force */
938             tx               = _mm_mul_pd(fscal,dx02);
939             ty               = _mm_mul_pd(fscal,dy02);
940             tz               = _mm_mul_pd(fscal,dz02);
941
942             /* Update vectorial force */
943             fix0             = _mm_add_pd(fix0,tx);
944             fiy0             = _mm_add_pd(fiy0,ty);
945             fiz0             = _mm_add_pd(fiz0,tz);
946
947             fjx2             = _mm_add_pd(fjx2,tx);
948             fjy2             = _mm_add_pd(fjy2,ty);
949             fjz2             = _mm_add_pd(fjz2,tz);
950
951             /**************************
952              * CALCULATE INTERACTIONS *
953              **************************/
954
955             r10              = _mm_mul_pd(rsq10,rinv10);
956
957             /* Calculate table index by multiplying r with table scale and truncate to integer */
958             rt               = _mm_mul_pd(r10,vftabscale);
959             vfitab           = _mm_cvttpd_epi32(rt);
960             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
961             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
962
963             /* CUBIC SPLINE TABLE ELECTROSTATICS */
964             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
965             F                = _mm_setzero_pd();
966             GMX_MM_TRANSPOSE2_PD(Y,F);
967             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
968             H                = _mm_setzero_pd();
969             GMX_MM_TRANSPOSE2_PD(G,H);
970             Heps             = _mm_mul_pd(vfeps,H);
971             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
972             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
973             velec            = _mm_mul_pd(qq10,VV);
974             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
975             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
976
977             /* Update potential sum for this i atom from the interaction with this j atom. */
978             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
979             velecsum         = _mm_add_pd(velecsum,velec);
980
981             fscal            = felec;
982
983             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
984
985             /* Calculate temporary vectorial force */
986             tx               = _mm_mul_pd(fscal,dx10);
987             ty               = _mm_mul_pd(fscal,dy10);
988             tz               = _mm_mul_pd(fscal,dz10);
989
990             /* Update vectorial force */
991             fix1             = _mm_add_pd(fix1,tx);
992             fiy1             = _mm_add_pd(fiy1,ty);
993             fiz1             = _mm_add_pd(fiz1,tz);
994
995             fjx0             = _mm_add_pd(fjx0,tx);
996             fjy0             = _mm_add_pd(fjy0,ty);
997             fjz0             = _mm_add_pd(fjz0,tz);
998
999             /**************************
1000              * CALCULATE INTERACTIONS *
1001              **************************/
1002
1003             r11              = _mm_mul_pd(rsq11,rinv11);
1004
1005             /* Calculate table index by multiplying r with table scale and truncate to integer */
1006             rt               = _mm_mul_pd(r11,vftabscale);
1007             vfitab           = _mm_cvttpd_epi32(rt);
1008             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1009             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1010
1011             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1012             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1013             F                = _mm_setzero_pd();
1014             GMX_MM_TRANSPOSE2_PD(Y,F);
1015             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1016             H                = _mm_setzero_pd();
1017             GMX_MM_TRANSPOSE2_PD(G,H);
1018             Heps             = _mm_mul_pd(vfeps,H);
1019             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1020             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1021             velec            = _mm_mul_pd(qq11,VV);
1022             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1023             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1024
1025             /* Update potential sum for this i atom from the interaction with this j atom. */
1026             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1027             velecsum         = _mm_add_pd(velecsum,velec);
1028
1029             fscal            = felec;
1030
1031             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1032
1033             /* Calculate temporary vectorial force */
1034             tx               = _mm_mul_pd(fscal,dx11);
1035             ty               = _mm_mul_pd(fscal,dy11);
1036             tz               = _mm_mul_pd(fscal,dz11);
1037
1038             /* Update vectorial force */
1039             fix1             = _mm_add_pd(fix1,tx);
1040             fiy1             = _mm_add_pd(fiy1,ty);
1041             fiz1             = _mm_add_pd(fiz1,tz);
1042
1043             fjx1             = _mm_add_pd(fjx1,tx);
1044             fjy1             = _mm_add_pd(fjy1,ty);
1045             fjz1             = _mm_add_pd(fjz1,tz);
1046
1047             /**************************
1048              * CALCULATE INTERACTIONS *
1049              **************************/
1050
1051             r12              = _mm_mul_pd(rsq12,rinv12);
1052
1053             /* Calculate table index by multiplying r with table scale and truncate to integer */
1054             rt               = _mm_mul_pd(r12,vftabscale);
1055             vfitab           = _mm_cvttpd_epi32(rt);
1056             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1057             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1058
1059             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1060             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1061             F                = _mm_setzero_pd();
1062             GMX_MM_TRANSPOSE2_PD(Y,F);
1063             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1064             H                = _mm_setzero_pd();
1065             GMX_MM_TRANSPOSE2_PD(G,H);
1066             Heps             = _mm_mul_pd(vfeps,H);
1067             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1068             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1069             velec            = _mm_mul_pd(qq12,VV);
1070             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1071             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1072
1073             /* Update potential sum for this i atom from the interaction with this j atom. */
1074             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1075             velecsum         = _mm_add_pd(velecsum,velec);
1076
1077             fscal            = felec;
1078
1079             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1080
1081             /* Calculate temporary vectorial force */
1082             tx               = _mm_mul_pd(fscal,dx12);
1083             ty               = _mm_mul_pd(fscal,dy12);
1084             tz               = _mm_mul_pd(fscal,dz12);
1085
1086             /* Update vectorial force */
1087             fix1             = _mm_add_pd(fix1,tx);
1088             fiy1             = _mm_add_pd(fiy1,ty);
1089             fiz1             = _mm_add_pd(fiz1,tz);
1090
1091             fjx2             = _mm_add_pd(fjx2,tx);
1092             fjy2             = _mm_add_pd(fjy2,ty);
1093             fjz2             = _mm_add_pd(fjz2,tz);
1094
1095             /**************************
1096              * CALCULATE INTERACTIONS *
1097              **************************/
1098
1099             r20              = _mm_mul_pd(rsq20,rinv20);
1100
1101             /* Calculate table index by multiplying r with table scale and truncate to integer */
1102             rt               = _mm_mul_pd(r20,vftabscale);
1103             vfitab           = _mm_cvttpd_epi32(rt);
1104             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1105             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1106
1107             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1108             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1109             F                = _mm_setzero_pd();
1110             GMX_MM_TRANSPOSE2_PD(Y,F);
1111             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1112             H                = _mm_setzero_pd();
1113             GMX_MM_TRANSPOSE2_PD(G,H);
1114             Heps             = _mm_mul_pd(vfeps,H);
1115             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1116             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1117             velec            = _mm_mul_pd(qq20,VV);
1118             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1119             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1120
1121             /* Update potential sum for this i atom from the interaction with this j atom. */
1122             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1123             velecsum         = _mm_add_pd(velecsum,velec);
1124
1125             fscal            = felec;
1126
1127             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1128
1129             /* Calculate temporary vectorial force */
1130             tx               = _mm_mul_pd(fscal,dx20);
1131             ty               = _mm_mul_pd(fscal,dy20);
1132             tz               = _mm_mul_pd(fscal,dz20);
1133
1134             /* Update vectorial force */
1135             fix2             = _mm_add_pd(fix2,tx);
1136             fiy2             = _mm_add_pd(fiy2,ty);
1137             fiz2             = _mm_add_pd(fiz2,tz);
1138
1139             fjx0             = _mm_add_pd(fjx0,tx);
1140             fjy0             = _mm_add_pd(fjy0,ty);
1141             fjz0             = _mm_add_pd(fjz0,tz);
1142
1143             /**************************
1144              * CALCULATE INTERACTIONS *
1145              **************************/
1146
1147             r21              = _mm_mul_pd(rsq21,rinv21);
1148
1149             /* Calculate table index by multiplying r with table scale and truncate to integer */
1150             rt               = _mm_mul_pd(r21,vftabscale);
1151             vfitab           = _mm_cvttpd_epi32(rt);
1152             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1153             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1154
1155             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1156             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1157             F                = _mm_setzero_pd();
1158             GMX_MM_TRANSPOSE2_PD(Y,F);
1159             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1160             H                = _mm_setzero_pd();
1161             GMX_MM_TRANSPOSE2_PD(G,H);
1162             Heps             = _mm_mul_pd(vfeps,H);
1163             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1164             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1165             velec            = _mm_mul_pd(qq21,VV);
1166             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1167             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1168
1169             /* Update potential sum for this i atom from the interaction with this j atom. */
1170             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1171             velecsum         = _mm_add_pd(velecsum,velec);
1172
1173             fscal            = felec;
1174
1175             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1176
1177             /* Calculate temporary vectorial force */
1178             tx               = _mm_mul_pd(fscal,dx21);
1179             ty               = _mm_mul_pd(fscal,dy21);
1180             tz               = _mm_mul_pd(fscal,dz21);
1181
1182             /* Update vectorial force */
1183             fix2             = _mm_add_pd(fix2,tx);
1184             fiy2             = _mm_add_pd(fiy2,ty);
1185             fiz2             = _mm_add_pd(fiz2,tz);
1186
1187             fjx1             = _mm_add_pd(fjx1,tx);
1188             fjy1             = _mm_add_pd(fjy1,ty);
1189             fjz1             = _mm_add_pd(fjz1,tz);
1190
1191             /**************************
1192              * CALCULATE INTERACTIONS *
1193              **************************/
1194
1195             r22              = _mm_mul_pd(rsq22,rinv22);
1196
1197             /* Calculate table index by multiplying r with table scale and truncate to integer */
1198             rt               = _mm_mul_pd(r22,vftabscale);
1199             vfitab           = _mm_cvttpd_epi32(rt);
1200             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1201             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1202
1203             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1204             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1205             F                = _mm_setzero_pd();
1206             GMX_MM_TRANSPOSE2_PD(Y,F);
1207             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1208             H                = _mm_setzero_pd();
1209             GMX_MM_TRANSPOSE2_PD(G,H);
1210             Heps             = _mm_mul_pd(vfeps,H);
1211             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1212             VV               = _mm_add_pd(Y,_mm_mul_pd(vfeps,Fp));
1213             velec            = _mm_mul_pd(qq22,VV);
1214             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1215             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1216
1217             /* Update potential sum for this i atom from the interaction with this j atom. */
1218             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1219             velecsum         = _mm_add_pd(velecsum,velec);
1220
1221             fscal            = felec;
1222
1223             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1224
1225             /* Calculate temporary vectorial force */
1226             tx               = _mm_mul_pd(fscal,dx22);
1227             ty               = _mm_mul_pd(fscal,dy22);
1228             tz               = _mm_mul_pd(fscal,dz22);
1229
1230             /* Update vectorial force */
1231             fix2             = _mm_add_pd(fix2,tx);
1232             fiy2             = _mm_add_pd(fiy2,ty);
1233             fiz2             = _mm_add_pd(fiz2,tz);
1234
1235             fjx2             = _mm_add_pd(fjx2,tx);
1236             fjy2             = _mm_add_pd(fjy2,ty);
1237             fjz2             = _mm_add_pd(fjz2,tz);
1238
1239             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1240
1241             /* Inner loop uses 417 flops */
1242         }
1243
1244         /* End of innermost loop */
1245
1246         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1247                                               f+i_coord_offset,fshift+i_shift_offset);
1248
1249         ggid                        = gid[iidx];
1250         /* Update potential energies */
1251         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1252         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1253
1254         /* Increment number of inner iterations */
1255         inneriter                  += j_index_end - j_index_start;
1256
1257         /* Outer loop uses 20 flops */
1258     }
1259
1260     /* Increment number of outer iterations */
1261     outeriter        += nri;
1262
1263     /* Update outer/inner flops */
1264
1265     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*417);
1266 }
1267 /*
1268  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1269  * Electrostatics interaction: CubicSplineTable
1270  * VdW interaction:            CubicSplineTable
1271  * Geometry:                   Water3-Water3
1272  * Calculate force/pot:        Force
1273  */
1274 void
1275 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_double
1276                     (t_nblist * gmx_restrict                nlist,
1277                      rvec * gmx_restrict                    xx,
1278                      rvec * gmx_restrict                    ff,
1279                      t_forcerec * gmx_restrict              fr,
1280                      t_mdatoms * gmx_restrict               mdatoms,
1281                      nb_kernel_data_t * gmx_restrict        kernel_data,
1282                      t_nrnb * gmx_restrict                  nrnb)
1283 {
1284     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1285      * just 0 for non-waters.
1286      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1287      * jnr indices corresponding to data put in the four positions in the SIMD register.
1288      */
1289     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1290     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1291     int              jnrA,jnrB;
1292     int              j_coord_offsetA,j_coord_offsetB;
1293     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1294     real             rcutoff_scalar;
1295     real             *shiftvec,*fshift,*x,*f;
1296     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1297     int              vdwioffset0;
1298     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1299     int              vdwioffset1;
1300     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1301     int              vdwioffset2;
1302     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1303     int              vdwjidx0A,vdwjidx0B;
1304     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1305     int              vdwjidx1A,vdwjidx1B;
1306     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1307     int              vdwjidx2A,vdwjidx2B;
1308     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1309     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1310     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1311     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1312     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1313     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1314     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1315     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1316     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1317     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1318     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1319     real             *charge;
1320     int              nvdwtype;
1321     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1322     int              *vdwtype;
1323     real             *vdwparam;
1324     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1325     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1326     __m128i          vfitab;
1327     __m128i          ifour       = _mm_set1_epi32(4);
1328     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1329     real             *vftab;
1330     __m128d          dummy_mask,cutoff_mask;
1331     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1332     __m128d          one     = _mm_set1_pd(1.0);
1333     __m128d          two     = _mm_set1_pd(2.0);
1334     x                = xx[0];
1335     f                = ff[0];
1336
1337     nri              = nlist->nri;
1338     iinr             = nlist->iinr;
1339     jindex           = nlist->jindex;
1340     jjnr             = nlist->jjnr;
1341     shiftidx         = nlist->shift;
1342     gid              = nlist->gid;
1343     shiftvec         = fr->shift_vec[0];
1344     fshift           = fr->fshift[0];
1345     facel            = _mm_set1_pd(fr->epsfac);
1346     charge           = mdatoms->chargeA;
1347     nvdwtype         = fr->ntype;
1348     vdwparam         = fr->nbfp;
1349     vdwtype          = mdatoms->typeA;
1350
1351     vftab            = kernel_data->table_elec_vdw->data;
1352     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1353
1354     /* Setup water-specific parameters */
1355     inr              = nlist->iinr[0];
1356     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1357     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1358     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1359     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1360
1361     jq0              = _mm_set1_pd(charge[inr+0]);
1362     jq1              = _mm_set1_pd(charge[inr+1]);
1363     jq2              = _mm_set1_pd(charge[inr+2]);
1364     vdwjidx0A        = 2*vdwtype[inr+0];
1365     qq00             = _mm_mul_pd(iq0,jq0);
1366     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1367     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1368     qq01             = _mm_mul_pd(iq0,jq1);
1369     qq02             = _mm_mul_pd(iq0,jq2);
1370     qq10             = _mm_mul_pd(iq1,jq0);
1371     qq11             = _mm_mul_pd(iq1,jq1);
1372     qq12             = _mm_mul_pd(iq1,jq2);
1373     qq20             = _mm_mul_pd(iq2,jq0);
1374     qq21             = _mm_mul_pd(iq2,jq1);
1375     qq22             = _mm_mul_pd(iq2,jq2);
1376
1377     /* Avoid stupid compiler warnings */
1378     jnrA = jnrB = 0;
1379     j_coord_offsetA = 0;
1380     j_coord_offsetB = 0;
1381
1382     outeriter        = 0;
1383     inneriter        = 0;
1384
1385     /* Start outer loop over neighborlists */
1386     for(iidx=0; iidx<nri; iidx++)
1387     {
1388         /* Load shift vector for this list */
1389         i_shift_offset   = DIM*shiftidx[iidx];
1390
1391         /* Load limits for loop over neighbors */
1392         j_index_start    = jindex[iidx];
1393         j_index_end      = jindex[iidx+1];
1394
1395         /* Get outer coordinate index */
1396         inr              = iinr[iidx];
1397         i_coord_offset   = DIM*inr;
1398
1399         /* Load i particle coords and add shift vector */
1400         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1401                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1402
1403         fix0             = _mm_setzero_pd();
1404         fiy0             = _mm_setzero_pd();
1405         fiz0             = _mm_setzero_pd();
1406         fix1             = _mm_setzero_pd();
1407         fiy1             = _mm_setzero_pd();
1408         fiz1             = _mm_setzero_pd();
1409         fix2             = _mm_setzero_pd();
1410         fiy2             = _mm_setzero_pd();
1411         fiz2             = _mm_setzero_pd();
1412
1413         /* Start inner kernel loop */
1414         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1415         {
1416
1417             /* Get j neighbor index, and coordinate index */
1418             jnrA             = jjnr[jidx];
1419             jnrB             = jjnr[jidx+1];
1420             j_coord_offsetA  = DIM*jnrA;
1421             j_coord_offsetB  = DIM*jnrB;
1422             
1423             /* load j atom coordinates */
1424             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1425                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1426             
1427             /* Calculate displacement vector */
1428             dx00             = _mm_sub_pd(ix0,jx0);
1429             dy00             = _mm_sub_pd(iy0,jy0);
1430             dz00             = _mm_sub_pd(iz0,jz0);
1431             dx01             = _mm_sub_pd(ix0,jx1);
1432             dy01             = _mm_sub_pd(iy0,jy1);
1433             dz01             = _mm_sub_pd(iz0,jz1);
1434             dx02             = _mm_sub_pd(ix0,jx2);
1435             dy02             = _mm_sub_pd(iy0,jy2);
1436             dz02             = _mm_sub_pd(iz0,jz2);
1437             dx10             = _mm_sub_pd(ix1,jx0);
1438             dy10             = _mm_sub_pd(iy1,jy0);
1439             dz10             = _mm_sub_pd(iz1,jz0);
1440             dx11             = _mm_sub_pd(ix1,jx1);
1441             dy11             = _mm_sub_pd(iy1,jy1);
1442             dz11             = _mm_sub_pd(iz1,jz1);
1443             dx12             = _mm_sub_pd(ix1,jx2);
1444             dy12             = _mm_sub_pd(iy1,jy2);
1445             dz12             = _mm_sub_pd(iz1,jz2);
1446             dx20             = _mm_sub_pd(ix2,jx0);
1447             dy20             = _mm_sub_pd(iy2,jy0);
1448             dz20             = _mm_sub_pd(iz2,jz0);
1449             dx21             = _mm_sub_pd(ix2,jx1);
1450             dy21             = _mm_sub_pd(iy2,jy1);
1451             dz21             = _mm_sub_pd(iz2,jz1);
1452             dx22             = _mm_sub_pd(ix2,jx2);
1453             dy22             = _mm_sub_pd(iy2,jy2);
1454             dz22             = _mm_sub_pd(iz2,jz2);
1455
1456             /* Calculate squared distance and things based on it */
1457             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1458             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1459             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1460             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1461             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1462             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1463             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1464             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1465             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1466
1467             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1468             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1469             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1470             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1471             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1472             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1473             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1474             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1475             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1476
1477             fjx0             = _mm_setzero_pd();
1478             fjy0             = _mm_setzero_pd();
1479             fjz0             = _mm_setzero_pd();
1480             fjx1             = _mm_setzero_pd();
1481             fjy1             = _mm_setzero_pd();
1482             fjz1             = _mm_setzero_pd();
1483             fjx2             = _mm_setzero_pd();
1484             fjy2             = _mm_setzero_pd();
1485             fjz2             = _mm_setzero_pd();
1486
1487             /**************************
1488              * CALCULATE INTERACTIONS *
1489              **************************/
1490
1491             r00              = _mm_mul_pd(rsq00,rinv00);
1492
1493             /* Calculate table index by multiplying r with table scale and truncate to integer */
1494             rt               = _mm_mul_pd(r00,vftabscale);
1495             vfitab           = _mm_cvttpd_epi32(rt);
1496             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1497             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1498
1499             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1500             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1501             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1502             GMX_MM_TRANSPOSE2_PD(Y,F);
1503             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1504             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1505             GMX_MM_TRANSPOSE2_PD(G,H);
1506             Heps             = _mm_mul_pd(vfeps,H);
1507             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1508             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1509             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1510
1511             /* CUBIC SPLINE TABLE DISPERSION */
1512             vfitab           = _mm_add_epi32(vfitab,ifour);
1513             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1514             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1515             GMX_MM_TRANSPOSE2_PD(Y,F);
1516             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1517             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1518             GMX_MM_TRANSPOSE2_PD(G,H);
1519             Heps             = _mm_mul_pd(vfeps,H);
1520             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1521             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1522             fvdw6            = _mm_mul_pd(c6_00,FF);
1523
1524             /* CUBIC SPLINE TABLE REPULSION */
1525             vfitab           = _mm_add_epi32(vfitab,ifour);
1526             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1527             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1528             GMX_MM_TRANSPOSE2_PD(Y,F);
1529             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1530             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1531             GMX_MM_TRANSPOSE2_PD(G,H);
1532             Heps             = _mm_mul_pd(vfeps,H);
1533             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1534             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1535             fvdw12           = _mm_mul_pd(c12_00,FF);
1536             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1537
1538             fscal            = _mm_add_pd(felec,fvdw);
1539
1540             /* Calculate temporary vectorial force */
1541             tx               = _mm_mul_pd(fscal,dx00);
1542             ty               = _mm_mul_pd(fscal,dy00);
1543             tz               = _mm_mul_pd(fscal,dz00);
1544
1545             /* Update vectorial force */
1546             fix0             = _mm_add_pd(fix0,tx);
1547             fiy0             = _mm_add_pd(fiy0,ty);
1548             fiz0             = _mm_add_pd(fiz0,tz);
1549
1550             fjx0             = _mm_add_pd(fjx0,tx);
1551             fjy0             = _mm_add_pd(fjy0,ty);
1552             fjz0             = _mm_add_pd(fjz0,tz);
1553
1554             /**************************
1555              * CALCULATE INTERACTIONS *
1556              **************************/
1557
1558             r01              = _mm_mul_pd(rsq01,rinv01);
1559
1560             /* Calculate table index by multiplying r with table scale and truncate to integer */
1561             rt               = _mm_mul_pd(r01,vftabscale);
1562             vfitab           = _mm_cvttpd_epi32(rt);
1563             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1564             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1565
1566             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1567             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1568             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1569             GMX_MM_TRANSPOSE2_PD(Y,F);
1570             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1571             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1572             GMX_MM_TRANSPOSE2_PD(G,H);
1573             Heps             = _mm_mul_pd(vfeps,H);
1574             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1575             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1576             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1577
1578             fscal            = felec;
1579
1580             /* Calculate temporary vectorial force */
1581             tx               = _mm_mul_pd(fscal,dx01);
1582             ty               = _mm_mul_pd(fscal,dy01);
1583             tz               = _mm_mul_pd(fscal,dz01);
1584
1585             /* Update vectorial force */
1586             fix0             = _mm_add_pd(fix0,tx);
1587             fiy0             = _mm_add_pd(fiy0,ty);
1588             fiz0             = _mm_add_pd(fiz0,tz);
1589
1590             fjx1             = _mm_add_pd(fjx1,tx);
1591             fjy1             = _mm_add_pd(fjy1,ty);
1592             fjz1             = _mm_add_pd(fjz1,tz);
1593
1594             /**************************
1595              * CALCULATE INTERACTIONS *
1596              **************************/
1597
1598             r02              = _mm_mul_pd(rsq02,rinv02);
1599
1600             /* Calculate table index by multiplying r with table scale and truncate to integer */
1601             rt               = _mm_mul_pd(r02,vftabscale);
1602             vfitab           = _mm_cvttpd_epi32(rt);
1603             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1604             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1605
1606             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1607             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1608             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1609             GMX_MM_TRANSPOSE2_PD(Y,F);
1610             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1611             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1612             GMX_MM_TRANSPOSE2_PD(G,H);
1613             Heps             = _mm_mul_pd(vfeps,H);
1614             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1615             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1616             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1617
1618             fscal            = felec;
1619
1620             /* Calculate temporary vectorial force */
1621             tx               = _mm_mul_pd(fscal,dx02);
1622             ty               = _mm_mul_pd(fscal,dy02);
1623             tz               = _mm_mul_pd(fscal,dz02);
1624
1625             /* Update vectorial force */
1626             fix0             = _mm_add_pd(fix0,tx);
1627             fiy0             = _mm_add_pd(fiy0,ty);
1628             fiz0             = _mm_add_pd(fiz0,tz);
1629
1630             fjx2             = _mm_add_pd(fjx2,tx);
1631             fjy2             = _mm_add_pd(fjy2,ty);
1632             fjz2             = _mm_add_pd(fjz2,tz);
1633
1634             /**************************
1635              * CALCULATE INTERACTIONS *
1636              **************************/
1637
1638             r10              = _mm_mul_pd(rsq10,rinv10);
1639
1640             /* Calculate table index by multiplying r with table scale and truncate to integer */
1641             rt               = _mm_mul_pd(r10,vftabscale);
1642             vfitab           = _mm_cvttpd_epi32(rt);
1643             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1644             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1645
1646             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1647             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1648             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1649             GMX_MM_TRANSPOSE2_PD(Y,F);
1650             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1651             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1652             GMX_MM_TRANSPOSE2_PD(G,H);
1653             Heps             = _mm_mul_pd(vfeps,H);
1654             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1655             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1656             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1657
1658             fscal            = felec;
1659
1660             /* Calculate temporary vectorial force */
1661             tx               = _mm_mul_pd(fscal,dx10);
1662             ty               = _mm_mul_pd(fscal,dy10);
1663             tz               = _mm_mul_pd(fscal,dz10);
1664
1665             /* Update vectorial force */
1666             fix1             = _mm_add_pd(fix1,tx);
1667             fiy1             = _mm_add_pd(fiy1,ty);
1668             fiz1             = _mm_add_pd(fiz1,tz);
1669
1670             fjx0             = _mm_add_pd(fjx0,tx);
1671             fjy0             = _mm_add_pd(fjy0,ty);
1672             fjz0             = _mm_add_pd(fjz0,tz);
1673
1674             /**************************
1675              * CALCULATE INTERACTIONS *
1676              **************************/
1677
1678             r11              = _mm_mul_pd(rsq11,rinv11);
1679
1680             /* Calculate table index by multiplying r with table scale and truncate to integer */
1681             rt               = _mm_mul_pd(r11,vftabscale);
1682             vfitab           = _mm_cvttpd_epi32(rt);
1683             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1684             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1685
1686             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1687             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1688             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1689             GMX_MM_TRANSPOSE2_PD(Y,F);
1690             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1691             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1692             GMX_MM_TRANSPOSE2_PD(G,H);
1693             Heps             = _mm_mul_pd(vfeps,H);
1694             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1695             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1696             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1697
1698             fscal            = felec;
1699
1700             /* Calculate temporary vectorial force */
1701             tx               = _mm_mul_pd(fscal,dx11);
1702             ty               = _mm_mul_pd(fscal,dy11);
1703             tz               = _mm_mul_pd(fscal,dz11);
1704
1705             /* Update vectorial force */
1706             fix1             = _mm_add_pd(fix1,tx);
1707             fiy1             = _mm_add_pd(fiy1,ty);
1708             fiz1             = _mm_add_pd(fiz1,tz);
1709
1710             fjx1             = _mm_add_pd(fjx1,tx);
1711             fjy1             = _mm_add_pd(fjy1,ty);
1712             fjz1             = _mm_add_pd(fjz1,tz);
1713
1714             /**************************
1715              * CALCULATE INTERACTIONS *
1716              **************************/
1717
1718             r12              = _mm_mul_pd(rsq12,rinv12);
1719
1720             /* Calculate table index by multiplying r with table scale and truncate to integer */
1721             rt               = _mm_mul_pd(r12,vftabscale);
1722             vfitab           = _mm_cvttpd_epi32(rt);
1723             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1724             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1725
1726             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1727             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1728             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1729             GMX_MM_TRANSPOSE2_PD(Y,F);
1730             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1731             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1732             GMX_MM_TRANSPOSE2_PD(G,H);
1733             Heps             = _mm_mul_pd(vfeps,H);
1734             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1735             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1736             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1737
1738             fscal            = felec;
1739
1740             /* Calculate temporary vectorial force */
1741             tx               = _mm_mul_pd(fscal,dx12);
1742             ty               = _mm_mul_pd(fscal,dy12);
1743             tz               = _mm_mul_pd(fscal,dz12);
1744
1745             /* Update vectorial force */
1746             fix1             = _mm_add_pd(fix1,tx);
1747             fiy1             = _mm_add_pd(fiy1,ty);
1748             fiz1             = _mm_add_pd(fiz1,tz);
1749
1750             fjx2             = _mm_add_pd(fjx2,tx);
1751             fjy2             = _mm_add_pd(fjy2,ty);
1752             fjz2             = _mm_add_pd(fjz2,tz);
1753
1754             /**************************
1755              * CALCULATE INTERACTIONS *
1756              **************************/
1757
1758             r20              = _mm_mul_pd(rsq20,rinv20);
1759
1760             /* Calculate table index by multiplying r with table scale and truncate to integer */
1761             rt               = _mm_mul_pd(r20,vftabscale);
1762             vfitab           = _mm_cvttpd_epi32(rt);
1763             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1764             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1765
1766             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1767             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1768             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1769             GMX_MM_TRANSPOSE2_PD(Y,F);
1770             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1771             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1772             GMX_MM_TRANSPOSE2_PD(G,H);
1773             Heps             = _mm_mul_pd(vfeps,H);
1774             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1775             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1776             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1777
1778             fscal            = felec;
1779
1780             /* Calculate temporary vectorial force */
1781             tx               = _mm_mul_pd(fscal,dx20);
1782             ty               = _mm_mul_pd(fscal,dy20);
1783             tz               = _mm_mul_pd(fscal,dz20);
1784
1785             /* Update vectorial force */
1786             fix2             = _mm_add_pd(fix2,tx);
1787             fiy2             = _mm_add_pd(fiy2,ty);
1788             fiz2             = _mm_add_pd(fiz2,tz);
1789
1790             fjx0             = _mm_add_pd(fjx0,tx);
1791             fjy0             = _mm_add_pd(fjy0,ty);
1792             fjz0             = _mm_add_pd(fjz0,tz);
1793
1794             /**************************
1795              * CALCULATE INTERACTIONS *
1796              **************************/
1797
1798             r21              = _mm_mul_pd(rsq21,rinv21);
1799
1800             /* Calculate table index by multiplying r with table scale and truncate to integer */
1801             rt               = _mm_mul_pd(r21,vftabscale);
1802             vfitab           = _mm_cvttpd_epi32(rt);
1803             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1804             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1805
1806             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1807             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1808             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1809             GMX_MM_TRANSPOSE2_PD(Y,F);
1810             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1811             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1812             GMX_MM_TRANSPOSE2_PD(G,H);
1813             Heps             = _mm_mul_pd(vfeps,H);
1814             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1815             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1816             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1817
1818             fscal            = felec;
1819
1820             /* Calculate temporary vectorial force */
1821             tx               = _mm_mul_pd(fscal,dx21);
1822             ty               = _mm_mul_pd(fscal,dy21);
1823             tz               = _mm_mul_pd(fscal,dz21);
1824
1825             /* Update vectorial force */
1826             fix2             = _mm_add_pd(fix2,tx);
1827             fiy2             = _mm_add_pd(fiy2,ty);
1828             fiz2             = _mm_add_pd(fiz2,tz);
1829
1830             fjx1             = _mm_add_pd(fjx1,tx);
1831             fjy1             = _mm_add_pd(fjy1,ty);
1832             fjz1             = _mm_add_pd(fjz1,tz);
1833
1834             /**************************
1835              * CALCULATE INTERACTIONS *
1836              **************************/
1837
1838             r22              = _mm_mul_pd(rsq22,rinv22);
1839
1840             /* Calculate table index by multiplying r with table scale and truncate to integer */
1841             rt               = _mm_mul_pd(r22,vftabscale);
1842             vfitab           = _mm_cvttpd_epi32(rt);
1843             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1844             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1845
1846             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1847             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1848             F                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) );
1849             GMX_MM_TRANSPOSE2_PD(Y,F);
1850             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1851             H                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,1) +2);
1852             GMX_MM_TRANSPOSE2_PD(G,H);
1853             Heps             = _mm_mul_pd(vfeps,H);
1854             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1855             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1856             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1857
1858             fscal            = felec;
1859
1860             /* Calculate temporary vectorial force */
1861             tx               = _mm_mul_pd(fscal,dx22);
1862             ty               = _mm_mul_pd(fscal,dy22);
1863             tz               = _mm_mul_pd(fscal,dz22);
1864
1865             /* Update vectorial force */
1866             fix2             = _mm_add_pd(fix2,tx);
1867             fiy2             = _mm_add_pd(fiy2,ty);
1868             fiz2             = _mm_add_pd(fiz2,tz);
1869
1870             fjx2             = _mm_add_pd(fjx2,tx);
1871             fjy2             = _mm_add_pd(fjy2,ty);
1872             fjz2             = _mm_add_pd(fjz2,tz);
1873
1874             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1875
1876             /* Inner loop uses 373 flops */
1877         }
1878
1879         if(jidx<j_index_end)
1880         {
1881
1882             jnrA             = jjnr[jidx];
1883             j_coord_offsetA  = DIM*jnrA;
1884
1885             /* load j atom coordinates */
1886             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1887                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1888             
1889             /* Calculate displacement vector */
1890             dx00             = _mm_sub_pd(ix0,jx0);
1891             dy00             = _mm_sub_pd(iy0,jy0);
1892             dz00             = _mm_sub_pd(iz0,jz0);
1893             dx01             = _mm_sub_pd(ix0,jx1);
1894             dy01             = _mm_sub_pd(iy0,jy1);
1895             dz01             = _mm_sub_pd(iz0,jz1);
1896             dx02             = _mm_sub_pd(ix0,jx2);
1897             dy02             = _mm_sub_pd(iy0,jy2);
1898             dz02             = _mm_sub_pd(iz0,jz2);
1899             dx10             = _mm_sub_pd(ix1,jx0);
1900             dy10             = _mm_sub_pd(iy1,jy0);
1901             dz10             = _mm_sub_pd(iz1,jz0);
1902             dx11             = _mm_sub_pd(ix1,jx1);
1903             dy11             = _mm_sub_pd(iy1,jy1);
1904             dz11             = _mm_sub_pd(iz1,jz1);
1905             dx12             = _mm_sub_pd(ix1,jx2);
1906             dy12             = _mm_sub_pd(iy1,jy2);
1907             dz12             = _mm_sub_pd(iz1,jz2);
1908             dx20             = _mm_sub_pd(ix2,jx0);
1909             dy20             = _mm_sub_pd(iy2,jy0);
1910             dz20             = _mm_sub_pd(iz2,jz0);
1911             dx21             = _mm_sub_pd(ix2,jx1);
1912             dy21             = _mm_sub_pd(iy2,jy1);
1913             dz21             = _mm_sub_pd(iz2,jz1);
1914             dx22             = _mm_sub_pd(ix2,jx2);
1915             dy22             = _mm_sub_pd(iy2,jy2);
1916             dz22             = _mm_sub_pd(iz2,jz2);
1917
1918             /* Calculate squared distance and things based on it */
1919             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1920             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1921             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1922             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1923             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1924             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1925             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1926             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1927             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1928
1929             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1930             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1931             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1932             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1933             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1934             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1935             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1936             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1937             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1938
1939             fjx0             = _mm_setzero_pd();
1940             fjy0             = _mm_setzero_pd();
1941             fjz0             = _mm_setzero_pd();
1942             fjx1             = _mm_setzero_pd();
1943             fjy1             = _mm_setzero_pd();
1944             fjz1             = _mm_setzero_pd();
1945             fjx2             = _mm_setzero_pd();
1946             fjy2             = _mm_setzero_pd();
1947             fjz2             = _mm_setzero_pd();
1948
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             r00              = _mm_mul_pd(rsq00,rinv00);
1954
1955             /* Calculate table index by multiplying r with table scale and truncate to integer */
1956             rt               = _mm_mul_pd(r00,vftabscale);
1957             vfitab           = _mm_cvttpd_epi32(rt);
1958             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
1959             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1960
1961             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1962             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1963             F                = _mm_setzero_pd();
1964             GMX_MM_TRANSPOSE2_PD(Y,F);
1965             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1966             H                = _mm_setzero_pd();
1967             GMX_MM_TRANSPOSE2_PD(G,H);
1968             Heps             = _mm_mul_pd(vfeps,H);
1969             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1970             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1971             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1972
1973             /* CUBIC SPLINE TABLE DISPERSION */
1974             vfitab           = _mm_add_epi32(vfitab,ifour);
1975             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1976             F                = _mm_setzero_pd();
1977             GMX_MM_TRANSPOSE2_PD(Y,F);
1978             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1979             H                = _mm_setzero_pd();
1980             GMX_MM_TRANSPOSE2_PD(G,H);
1981             Heps             = _mm_mul_pd(vfeps,H);
1982             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1983             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1984             fvdw6            = _mm_mul_pd(c6_00,FF);
1985
1986             /* CUBIC SPLINE TABLE REPULSION */
1987             vfitab           = _mm_add_epi32(vfitab,ifour);
1988             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
1989             F                = _mm_setzero_pd();
1990             GMX_MM_TRANSPOSE2_PD(Y,F);
1991             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
1992             H                = _mm_setzero_pd();
1993             GMX_MM_TRANSPOSE2_PD(G,H);
1994             Heps             = _mm_mul_pd(vfeps,H);
1995             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
1996             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
1997             fvdw12           = _mm_mul_pd(c12_00,FF);
1998             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1999
2000             fscal            = _mm_add_pd(felec,fvdw);
2001
2002             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2003
2004             /* Calculate temporary vectorial force */
2005             tx               = _mm_mul_pd(fscal,dx00);
2006             ty               = _mm_mul_pd(fscal,dy00);
2007             tz               = _mm_mul_pd(fscal,dz00);
2008
2009             /* Update vectorial force */
2010             fix0             = _mm_add_pd(fix0,tx);
2011             fiy0             = _mm_add_pd(fiy0,ty);
2012             fiz0             = _mm_add_pd(fiz0,tz);
2013
2014             fjx0             = _mm_add_pd(fjx0,tx);
2015             fjy0             = _mm_add_pd(fjy0,ty);
2016             fjz0             = _mm_add_pd(fjz0,tz);
2017
2018             /**************************
2019              * CALCULATE INTERACTIONS *
2020              **************************/
2021
2022             r01              = _mm_mul_pd(rsq01,rinv01);
2023
2024             /* Calculate table index by multiplying r with table scale and truncate to integer */
2025             rt               = _mm_mul_pd(r01,vftabscale);
2026             vfitab           = _mm_cvttpd_epi32(rt);
2027             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2028             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2029
2030             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2031             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2032             F                = _mm_setzero_pd();
2033             GMX_MM_TRANSPOSE2_PD(Y,F);
2034             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2035             H                = _mm_setzero_pd();
2036             GMX_MM_TRANSPOSE2_PD(G,H);
2037             Heps             = _mm_mul_pd(vfeps,H);
2038             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2039             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2040             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
2041
2042             fscal            = felec;
2043
2044             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2045
2046             /* Calculate temporary vectorial force */
2047             tx               = _mm_mul_pd(fscal,dx01);
2048             ty               = _mm_mul_pd(fscal,dy01);
2049             tz               = _mm_mul_pd(fscal,dz01);
2050
2051             /* Update vectorial force */
2052             fix0             = _mm_add_pd(fix0,tx);
2053             fiy0             = _mm_add_pd(fiy0,ty);
2054             fiz0             = _mm_add_pd(fiz0,tz);
2055
2056             fjx1             = _mm_add_pd(fjx1,tx);
2057             fjy1             = _mm_add_pd(fjy1,ty);
2058             fjz1             = _mm_add_pd(fjz1,tz);
2059
2060             /**************************
2061              * CALCULATE INTERACTIONS *
2062              **************************/
2063
2064             r02              = _mm_mul_pd(rsq02,rinv02);
2065
2066             /* Calculate table index by multiplying r with table scale and truncate to integer */
2067             rt               = _mm_mul_pd(r02,vftabscale);
2068             vfitab           = _mm_cvttpd_epi32(rt);
2069             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2070             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2071
2072             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2073             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2074             F                = _mm_setzero_pd();
2075             GMX_MM_TRANSPOSE2_PD(Y,F);
2076             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2077             H                = _mm_setzero_pd();
2078             GMX_MM_TRANSPOSE2_PD(G,H);
2079             Heps             = _mm_mul_pd(vfeps,H);
2080             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2081             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2082             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2083
2084             fscal            = felec;
2085
2086             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2087
2088             /* Calculate temporary vectorial force */
2089             tx               = _mm_mul_pd(fscal,dx02);
2090             ty               = _mm_mul_pd(fscal,dy02);
2091             tz               = _mm_mul_pd(fscal,dz02);
2092
2093             /* Update vectorial force */
2094             fix0             = _mm_add_pd(fix0,tx);
2095             fiy0             = _mm_add_pd(fiy0,ty);
2096             fiz0             = _mm_add_pd(fiz0,tz);
2097
2098             fjx2             = _mm_add_pd(fjx2,tx);
2099             fjy2             = _mm_add_pd(fjy2,ty);
2100             fjz2             = _mm_add_pd(fjz2,tz);
2101
2102             /**************************
2103              * CALCULATE INTERACTIONS *
2104              **************************/
2105
2106             r10              = _mm_mul_pd(rsq10,rinv10);
2107
2108             /* Calculate table index by multiplying r with table scale and truncate to integer */
2109             rt               = _mm_mul_pd(r10,vftabscale);
2110             vfitab           = _mm_cvttpd_epi32(rt);
2111             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2112             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2113
2114             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2115             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2116             F                = _mm_setzero_pd();
2117             GMX_MM_TRANSPOSE2_PD(Y,F);
2118             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2119             H                = _mm_setzero_pd();
2120             GMX_MM_TRANSPOSE2_PD(G,H);
2121             Heps             = _mm_mul_pd(vfeps,H);
2122             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2123             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2124             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2125
2126             fscal            = felec;
2127
2128             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2129
2130             /* Calculate temporary vectorial force */
2131             tx               = _mm_mul_pd(fscal,dx10);
2132             ty               = _mm_mul_pd(fscal,dy10);
2133             tz               = _mm_mul_pd(fscal,dz10);
2134
2135             /* Update vectorial force */
2136             fix1             = _mm_add_pd(fix1,tx);
2137             fiy1             = _mm_add_pd(fiy1,ty);
2138             fiz1             = _mm_add_pd(fiz1,tz);
2139
2140             fjx0             = _mm_add_pd(fjx0,tx);
2141             fjy0             = _mm_add_pd(fjy0,ty);
2142             fjz0             = _mm_add_pd(fjz0,tz);
2143
2144             /**************************
2145              * CALCULATE INTERACTIONS *
2146              **************************/
2147
2148             r11              = _mm_mul_pd(rsq11,rinv11);
2149
2150             /* Calculate table index by multiplying r with table scale and truncate to integer */
2151             rt               = _mm_mul_pd(r11,vftabscale);
2152             vfitab           = _mm_cvttpd_epi32(rt);
2153             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2154             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2155
2156             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2157             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2158             F                = _mm_setzero_pd();
2159             GMX_MM_TRANSPOSE2_PD(Y,F);
2160             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2161             H                = _mm_setzero_pd();
2162             GMX_MM_TRANSPOSE2_PD(G,H);
2163             Heps             = _mm_mul_pd(vfeps,H);
2164             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2165             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2166             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2167
2168             fscal            = felec;
2169
2170             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2171
2172             /* Calculate temporary vectorial force */
2173             tx               = _mm_mul_pd(fscal,dx11);
2174             ty               = _mm_mul_pd(fscal,dy11);
2175             tz               = _mm_mul_pd(fscal,dz11);
2176
2177             /* Update vectorial force */
2178             fix1             = _mm_add_pd(fix1,tx);
2179             fiy1             = _mm_add_pd(fiy1,ty);
2180             fiz1             = _mm_add_pd(fiz1,tz);
2181
2182             fjx1             = _mm_add_pd(fjx1,tx);
2183             fjy1             = _mm_add_pd(fjy1,ty);
2184             fjz1             = _mm_add_pd(fjz1,tz);
2185
2186             /**************************
2187              * CALCULATE INTERACTIONS *
2188              **************************/
2189
2190             r12              = _mm_mul_pd(rsq12,rinv12);
2191
2192             /* Calculate table index by multiplying r with table scale and truncate to integer */
2193             rt               = _mm_mul_pd(r12,vftabscale);
2194             vfitab           = _mm_cvttpd_epi32(rt);
2195             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2196             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2197
2198             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2199             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2200             F                = _mm_setzero_pd();
2201             GMX_MM_TRANSPOSE2_PD(Y,F);
2202             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2203             H                = _mm_setzero_pd();
2204             GMX_MM_TRANSPOSE2_PD(G,H);
2205             Heps             = _mm_mul_pd(vfeps,H);
2206             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2207             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2208             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2209
2210             fscal            = felec;
2211
2212             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2213
2214             /* Calculate temporary vectorial force */
2215             tx               = _mm_mul_pd(fscal,dx12);
2216             ty               = _mm_mul_pd(fscal,dy12);
2217             tz               = _mm_mul_pd(fscal,dz12);
2218
2219             /* Update vectorial force */
2220             fix1             = _mm_add_pd(fix1,tx);
2221             fiy1             = _mm_add_pd(fiy1,ty);
2222             fiz1             = _mm_add_pd(fiz1,tz);
2223
2224             fjx2             = _mm_add_pd(fjx2,tx);
2225             fjy2             = _mm_add_pd(fjy2,ty);
2226             fjz2             = _mm_add_pd(fjz2,tz);
2227
2228             /**************************
2229              * CALCULATE INTERACTIONS *
2230              **************************/
2231
2232             r20              = _mm_mul_pd(rsq20,rinv20);
2233
2234             /* Calculate table index by multiplying r with table scale and truncate to integer */
2235             rt               = _mm_mul_pd(r20,vftabscale);
2236             vfitab           = _mm_cvttpd_epi32(rt);
2237             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2238             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2239
2240             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2241             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2242             F                = _mm_setzero_pd();
2243             GMX_MM_TRANSPOSE2_PD(Y,F);
2244             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2245             H                = _mm_setzero_pd();
2246             GMX_MM_TRANSPOSE2_PD(G,H);
2247             Heps             = _mm_mul_pd(vfeps,H);
2248             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2249             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2250             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2251
2252             fscal            = felec;
2253
2254             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2255
2256             /* Calculate temporary vectorial force */
2257             tx               = _mm_mul_pd(fscal,dx20);
2258             ty               = _mm_mul_pd(fscal,dy20);
2259             tz               = _mm_mul_pd(fscal,dz20);
2260
2261             /* Update vectorial force */
2262             fix2             = _mm_add_pd(fix2,tx);
2263             fiy2             = _mm_add_pd(fiy2,ty);
2264             fiz2             = _mm_add_pd(fiz2,tz);
2265
2266             fjx0             = _mm_add_pd(fjx0,tx);
2267             fjy0             = _mm_add_pd(fjy0,ty);
2268             fjz0             = _mm_add_pd(fjz0,tz);
2269
2270             /**************************
2271              * CALCULATE INTERACTIONS *
2272              **************************/
2273
2274             r21              = _mm_mul_pd(rsq21,rinv21);
2275
2276             /* Calculate table index by multiplying r with table scale and truncate to integer */
2277             rt               = _mm_mul_pd(r21,vftabscale);
2278             vfitab           = _mm_cvttpd_epi32(rt);
2279             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2280             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2281
2282             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2283             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2284             F                = _mm_setzero_pd();
2285             GMX_MM_TRANSPOSE2_PD(Y,F);
2286             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2287             H                = _mm_setzero_pd();
2288             GMX_MM_TRANSPOSE2_PD(G,H);
2289             Heps             = _mm_mul_pd(vfeps,H);
2290             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2291             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2292             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2293
2294             fscal            = felec;
2295
2296             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2297
2298             /* Calculate temporary vectorial force */
2299             tx               = _mm_mul_pd(fscal,dx21);
2300             ty               = _mm_mul_pd(fscal,dy21);
2301             tz               = _mm_mul_pd(fscal,dz21);
2302
2303             /* Update vectorial force */
2304             fix2             = _mm_add_pd(fix2,tx);
2305             fiy2             = _mm_add_pd(fiy2,ty);
2306             fiz2             = _mm_add_pd(fiz2,tz);
2307
2308             fjx1             = _mm_add_pd(fjx1,tx);
2309             fjy1             = _mm_add_pd(fjy1,ty);
2310             fjz1             = _mm_add_pd(fjz1,tz);
2311
2312             /**************************
2313              * CALCULATE INTERACTIONS *
2314              **************************/
2315
2316             r22              = _mm_mul_pd(rsq22,rinv22);
2317
2318             /* Calculate table index by multiplying r with table scale and truncate to integer */
2319             rt               = _mm_mul_pd(r22,vftabscale);
2320             vfitab           = _mm_cvttpd_epi32(rt);
2321             vfeps            = _mm_sub_pd(rt,_mm_cvtepi32_pd(vfitab));
2322             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2323
2324             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2325             Y                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) );
2326             F                = _mm_setzero_pd();
2327             GMX_MM_TRANSPOSE2_PD(Y,F);
2328             G                = _mm_load_pd( vftab + gmx_mm_extract_epi32(vfitab,0) +2);
2329             H                = _mm_setzero_pd();
2330             GMX_MM_TRANSPOSE2_PD(G,H);
2331             Heps             = _mm_mul_pd(vfeps,H);
2332             Fp               = _mm_add_pd(F,_mm_mul_pd(vfeps,_mm_add_pd(G,Heps)));
2333             FF               = _mm_add_pd(Fp,_mm_mul_pd(vfeps,_mm_add_pd(G,_mm_add_pd(Heps,Heps))));
2334             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2335
2336             fscal            = felec;
2337
2338             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2339
2340             /* Calculate temporary vectorial force */
2341             tx               = _mm_mul_pd(fscal,dx22);
2342             ty               = _mm_mul_pd(fscal,dy22);
2343             tz               = _mm_mul_pd(fscal,dz22);
2344
2345             /* Update vectorial force */
2346             fix2             = _mm_add_pd(fix2,tx);
2347             fiy2             = _mm_add_pd(fiy2,ty);
2348             fiz2             = _mm_add_pd(fiz2,tz);
2349
2350             fjx2             = _mm_add_pd(fjx2,tx);
2351             fjy2             = _mm_add_pd(fjy2,ty);
2352             fjz2             = _mm_add_pd(fjz2,tz);
2353
2354             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2355
2356             /* Inner loop uses 373 flops */
2357         }
2358
2359         /* End of innermost loop */
2360
2361         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2362                                               f+i_coord_offset,fshift+i_shift_offset);
2363
2364         /* Increment number of inner iterations */
2365         inneriter                  += j_index_end - j_index_start;
2366
2367         /* Outer loop uses 18 flops */
2368     }
2369
2370     /* Increment number of outer iterations */
2371     outeriter        += nri;
2372
2373     /* Update outer/inner flops */
2374
2375     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*373);
2376 }