Created SIMD module
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_avx_128_fma_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
50 #include "kernelutil_x86_avx_128_fma_double.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            CubicSplineTable
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_128_fma_double
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB;
77     int              j_coord_offsetA,j_coord_offsetB;
78     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
79     real             rcutoff_scalar;
80     real             *shiftvec,*fshift,*x,*f;
81     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B;
89     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B;
91     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B;
93     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
110     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
111     __m128i          vfitab;
112     __m128i          ifour       = _mm_set1_epi32(4);
113     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
114     real             *vftab;
115     __m128d          dummy_mask,cutoff_mask;
116     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
117     __m128d          one     = _mm_set1_pd(1.0);
118     __m128d          two     = _mm_set1_pd(2.0);
119     x                = xx[0];
120     f                = ff[0];
121
122     nri              = nlist->nri;
123     iinr             = nlist->iinr;
124     jindex           = nlist->jindex;
125     jjnr             = nlist->jjnr;
126     shiftidx         = nlist->shift;
127     gid              = nlist->gid;
128     shiftvec         = fr->shift_vec[0];
129     fshift           = fr->fshift[0];
130     facel            = _mm_set1_pd(fr->epsfac);
131     charge           = mdatoms->chargeA;
132     nvdwtype         = fr->ntype;
133     vdwparam         = fr->nbfp;
134     vdwtype          = mdatoms->typeA;
135
136     vftab            = kernel_data->table_elec_vdw->data;
137     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
138
139     /* Setup water-specific parameters */
140     inr              = nlist->iinr[0];
141     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
142     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
143     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
144     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
145
146     jq0              = _mm_set1_pd(charge[inr+0]);
147     jq1              = _mm_set1_pd(charge[inr+1]);
148     jq2              = _mm_set1_pd(charge[inr+2]);
149     vdwjidx0A        = 2*vdwtype[inr+0];
150     qq00             = _mm_mul_pd(iq0,jq0);
151     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
152     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
153     qq01             = _mm_mul_pd(iq0,jq1);
154     qq02             = _mm_mul_pd(iq0,jq2);
155     qq10             = _mm_mul_pd(iq1,jq0);
156     qq11             = _mm_mul_pd(iq1,jq1);
157     qq12             = _mm_mul_pd(iq1,jq2);
158     qq20             = _mm_mul_pd(iq2,jq0);
159     qq21             = _mm_mul_pd(iq2,jq1);
160     qq22             = _mm_mul_pd(iq2,jq2);
161
162     /* Avoid stupid compiler warnings */
163     jnrA = jnrB = 0;
164     j_coord_offsetA = 0;
165     j_coord_offsetB = 0;
166
167     outeriter        = 0;
168     inneriter        = 0;
169
170     /* Start outer loop over neighborlists */
171     for(iidx=0; iidx<nri; iidx++)
172     {
173         /* Load shift vector for this list */
174         i_shift_offset   = DIM*shiftidx[iidx];
175
176         /* Load limits for loop over neighbors */
177         j_index_start    = jindex[iidx];
178         j_index_end      = jindex[iidx+1];
179
180         /* Get outer coordinate index */
181         inr              = iinr[iidx];
182         i_coord_offset   = DIM*inr;
183
184         /* Load i particle coords and add shift vector */
185         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
186                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
187
188         fix0             = _mm_setzero_pd();
189         fiy0             = _mm_setzero_pd();
190         fiz0             = _mm_setzero_pd();
191         fix1             = _mm_setzero_pd();
192         fiy1             = _mm_setzero_pd();
193         fiz1             = _mm_setzero_pd();
194         fix2             = _mm_setzero_pd();
195         fiy2             = _mm_setzero_pd();
196         fiz2             = _mm_setzero_pd();
197
198         /* Reset potential sums */
199         velecsum         = _mm_setzero_pd();
200         vvdwsum          = _mm_setzero_pd();
201
202         /* Start inner kernel loop */
203         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
204         {
205
206             /* Get j neighbor index, and coordinate index */
207             jnrA             = jjnr[jidx];
208             jnrB             = jjnr[jidx+1];
209             j_coord_offsetA  = DIM*jnrA;
210             j_coord_offsetB  = DIM*jnrB;
211
212             /* load j atom coordinates */
213             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
214                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
215
216             /* Calculate displacement vector */
217             dx00             = _mm_sub_pd(ix0,jx0);
218             dy00             = _mm_sub_pd(iy0,jy0);
219             dz00             = _mm_sub_pd(iz0,jz0);
220             dx01             = _mm_sub_pd(ix0,jx1);
221             dy01             = _mm_sub_pd(iy0,jy1);
222             dz01             = _mm_sub_pd(iz0,jz1);
223             dx02             = _mm_sub_pd(ix0,jx2);
224             dy02             = _mm_sub_pd(iy0,jy2);
225             dz02             = _mm_sub_pd(iz0,jz2);
226             dx10             = _mm_sub_pd(ix1,jx0);
227             dy10             = _mm_sub_pd(iy1,jy0);
228             dz10             = _mm_sub_pd(iz1,jz0);
229             dx11             = _mm_sub_pd(ix1,jx1);
230             dy11             = _mm_sub_pd(iy1,jy1);
231             dz11             = _mm_sub_pd(iz1,jz1);
232             dx12             = _mm_sub_pd(ix1,jx2);
233             dy12             = _mm_sub_pd(iy1,jy2);
234             dz12             = _mm_sub_pd(iz1,jz2);
235             dx20             = _mm_sub_pd(ix2,jx0);
236             dy20             = _mm_sub_pd(iy2,jy0);
237             dz20             = _mm_sub_pd(iz2,jz0);
238             dx21             = _mm_sub_pd(ix2,jx1);
239             dy21             = _mm_sub_pd(iy2,jy1);
240             dz21             = _mm_sub_pd(iz2,jz1);
241             dx22             = _mm_sub_pd(ix2,jx2);
242             dy22             = _mm_sub_pd(iy2,jy2);
243             dz22             = _mm_sub_pd(iz2,jz2);
244
245             /* Calculate squared distance and things based on it */
246             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
247             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
248             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
249             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
250             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
251             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
252             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
253             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
254             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
255
256             rinv00           = gmx_mm_invsqrt_pd(rsq00);
257             rinv01           = gmx_mm_invsqrt_pd(rsq01);
258             rinv02           = gmx_mm_invsqrt_pd(rsq02);
259             rinv10           = gmx_mm_invsqrt_pd(rsq10);
260             rinv11           = gmx_mm_invsqrt_pd(rsq11);
261             rinv12           = gmx_mm_invsqrt_pd(rsq12);
262             rinv20           = gmx_mm_invsqrt_pd(rsq20);
263             rinv21           = gmx_mm_invsqrt_pd(rsq21);
264             rinv22           = gmx_mm_invsqrt_pd(rsq22);
265
266             fjx0             = _mm_setzero_pd();
267             fjy0             = _mm_setzero_pd();
268             fjz0             = _mm_setzero_pd();
269             fjx1             = _mm_setzero_pd();
270             fjy1             = _mm_setzero_pd();
271             fjz1             = _mm_setzero_pd();
272             fjx2             = _mm_setzero_pd();
273             fjy2             = _mm_setzero_pd();
274             fjz2             = _mm_setzero_pd();
275
276             /**************************
277              * CALCULATE INTERACTIONS *
278              **************************/
279
280             r00              = _mm_mul_pd(rsq00,rinv00);
281
282             /* Calculate table index by multiplying r with table scale and truncate to integer */
283             rt               = _mm_mul_pd(r00,vftabscale);
284             vfitab           = _mm_cvttpd_epi32(rt);
285 #ifdef __XOP__
286             vfeps            = _mm_frcz_pd(rt);
287 #else
288             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
289 #endif
290             twovfeps         = _mm_add_pd(vfeps,vfeps);
291             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
292
293             /* CUBIC SPLINE TABLE ELECTROSTATICS */
294             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
295             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
296             GMX_MM_TRANSPOSE2_PD(Y,F);
297             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
298             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
299             GMX_MM_TRANSPOSE2_PD(G,H);
300             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
301             VV               = _mm_macc_pd(vfeps,Fp,Y);
302             velec            = _mm_mul_pd(qq00,VV);
303             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
304             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
305
306             /* CUBIC SPLINE TABLE DISPERSION */
307             vfitab           = _mm_add_epi32(vfitab,ifour);
308             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
309             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
310             GMX_MM_TRANSPOSE2_PD(Y,F);
311             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
312             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
313             GMX_MM_TRANSPOSE2_PD(G,H);
314             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
315             VV               = _mm_macc_pd(vfeps,Fp,Y);
316             vvdw6            = _mm_mul_pd(c6_00,VV);
317             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
318             fvdw6            = _mm_mul_pd(c6_00,FF);
319
320             /* CUBIC SPLINE TABLE REPULSION */
321             vfitab           = _mm_add_epi32(vfitab,ifour);
322             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
323             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
324             GMX_MM_TRANSPOSE2_PD(Y,F);
325             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
326             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
327             GMX_MM_TRANSPOSE2_PD(G,H);
328             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
329             VV               = _mm_macc_pd(vfeps,Fp,Y);
330             vvdw12           = _mm_mul_pd(c12_00,VV);
331             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
332             fvdw12           = _mm_mul_pd(c12_00,FF);
333             vvdw             = _mm_add_pd(vvdw12,vvdw6);
334             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
335
336             /* Update potential sum for this i atom from the interaction with this j atom. */
337             velecsum         = _mm_add_pd(velecsum,velec);
338             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
339
340             fscal            = _mm_add_pd(felec,fvdw);
341
342             /* Update vectorial force */
343             fix0             = _mm_macc_pd(dx00,fscal,fix0);
344             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
345             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
346             
347             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
348             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
349             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
350
351             /**************************
352              * CALCULATE INTERACTIONS *
353              **************************/
354
355             r01              = _mm_mul_pd(rsq01,rinv01);
356
357             /* Calculate table index by multiplying r with table scale and truncate to integer */
358             rt               = _mm_mul_pd(r01,vftabscale);
359             vfitab           = _mm_cvttpd_epi32(rt);
360 #ifdef __XOP__
361             vfeps            = _mm_frcz_pd(rt);
362 #else
363             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
364 #endif
365             twovfeps         = _mm_add_pd(vfeps,vfeps);
366             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
367
368             /* CUBIC SPLINE TABLE ELECTROSTATICS */
369             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
370             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
371             GMX_MM_TRANSPOSE2_PD(Y,F);
372             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
373             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
374             GMX_MM_TRANSPOSE2_PD(G,H);
375             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
376             VV               = _mm_macc_pd(vfeps,Fp,Y);
377             velec            = _mm_mul_pd(qq01,VV);
378             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
379             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
380
381             /* Update potential sum for this i atom from the interaction with this j atom. */
382             velecsum         = _mm_add_pd(velecsum,velec);
383
384             fscal            = felec;
385
386             /* Update vectorial force */
387             fix0             = _mm_macc_pd(dx01,fscal,fix0);
388             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
389             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
390             
391             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
392             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
393             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
394
395             /**************************
396              * CALCULATE INTERACTIONS *
397              **************************/
398
399             r02              = _mm_mul_pd(rsq02,rinv02);
400
401             /* Calculate table index by multiplying r with table scale and truncate to integer */
402             rt               = _mm_mul_pd(r02,vftabscale);
403             vfitab           = _mm_cvttpd_epi32(rt);
404 #ifdef __XOP__
405             vfeps            = _mm_frcz_pd(rt);
406 #else
407             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
408 #endif
409             twovfeps         = _mm_add_pd(vfeps,vfeps);
410             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
411
412             /* CUBIC SPLINE TABLE ELECTROSTATICS */
413             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
414             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
415             GMX_MM_TRANSPOSE2_PD(Y,F);
416             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
417             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
418             GMX_MM_TRANSPOSE2_PD(G,H);
419             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
420             VV               = _mm_macc_pd(vfeps,Fp,Y);
421             velec            = _mm_mul_pd(qq02,VV);
422             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
423             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
424
425             /* Update potential sum for this i atom from the interaction with this j atom. */
426             velecsum         = _mm_add_pd(velecsum,velec);
427
428             fscal            = felec;
429
430             /* Update vectorial force */
431             fix0             = _mm_macc_pd(dx02,fscal,fix0);
432             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
433             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
434             
435             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
436             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
437             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
438
439             /**************************
440              * CALCULATE INTERACTIONS *
441              **************************/
442
443             r10              = _mm_mul_pd(rsq10,rinv10);
444
445             /* Calculate table index by multiplying r with table scale and truncate to integer */
446             rt               = _mm_mul_pd(r10,vftabscale);
447             vfitab           = _mm_cvttpd_epi32(rt);
448 #ifdef __XOP__
449             vfeps            = _mm_frcz_pd(rt);
450 #else
451             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
452 #endif
453             twovfeps         = _mm_add_pd(vfeps,vfeps);
454             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
455
456             /* CUBIC SPLINE TABLE ELECTROSTATICS */
457             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
458             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
459             GMX_MM_TRANSPOSE2_PD(Y,F);
460             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
461             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
462             GMX_MM_TRANSPOSE2_PD(G,H);
463             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
464             VV               = _mm_macc_pd(vfeps,Fp,Y);
465             velec            = _mm_mul_pd(qq10,VV);
466             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
467             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
468
469             /* Update potential sum for this i atom from the interaction with this j atom. */
470             velecsum         = _mm_add_pd(velecsum,velec);
471
472             fscal            = felec;
473
474             /* Update vectorial force */
475             fix1             = _mm_macc_pd(dx10,fscal,fix1);
476             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
477             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
478             
479             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
480             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
481             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
482
483             /**************************
484              * CALCULATE INTERACTIONS *
485              **************************/
486
487             r11              = _mm_mul_pd(rsq11,rinv11);
488
489             /* Calculate table index by multiplying r with table scale and truncate to integer */
490             rt               = _mm_mul_pd(r11,vftabscale);
491             vfitab           = _mm_cvttpd_epi32(rt);
492 #ifdef __XOP__
493             vfeps            = _mm_frcz_pd(rt);
494 #else
495             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
496 #endif
497             twovfeps         = _mm_add_pd(vfeps,vfeps);
498             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
499
500             /* CUBIC SPLINE TABLE ELECTROSTATICS */
501             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
502             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
503             GMX_MM_TRANSPOSE2_PD(Y,F);
504             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
505             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
506             GMX_MM_TRANSPOSE2_PD(G,H);
507             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
508             VV               = _mm_macc_pd(vfeps,Fp,Y);
509             velec            = _mm_mul_pd(qq11,VV);
510             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
511             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             velecsum         = _mm_add_pd(velecsum,velec);
515
516             fscal            = felec;
517
518             /* Update vectorial force */
519             fix1             = _mm_macc_pd(dx11,fscal,fix1);
520             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
521             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
522             
523             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
524             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
525             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
526
527             /**************************
528              * CALCULATE INTERACTIONS *
529              **************************/
530
531             r12              = _mm_mul_pd(rsq12,rinv12);
532
533             /* Calculate table index by multiplying r with table scale and truncate to integer */
534             rt               = _mm_mul_pd(r12,vftabscale);
535             vfitab           = _mm_cvttpd_epi32(rt);
536 #ifdef __XOP__
537             vfeps            = _mm_frcz_pd(rt);
538 #else
539             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
540 #endif
541             twovfeps         = _mm_add_pd(vfeps,vfeps);
542             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
543
544             /* CUBIC SPLINE TABLE ELECTROSTATICS */
545             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
546             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
547             GMX_MM_TRANSPOSE2_PD(Y,F);
548             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
549             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
550             GMX_MM_TRANSPOSE2_PD(G,H);
551             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
552             VV               = _mm_macc_pd(vfeps,Fp,Y);
553             velec            = _mm_mul_pd(qq12,VV);
554             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
555             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
556
557             /* Update potential sum for this i atom from the interaction with this j atom. */
558             velecsum         = _mm_add_pd(velecsum,velec);
559
560             fscal            = felec;
561
562             /* Update vectorial force */
563             fix1             = _mm_macc_pd(dx12,fscal,fix1);
564             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
565             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
566             
567             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
568             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
569             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
570
571             /**************************
572              * CALCULATE INTERACTIONS *
573              **************************/
574
575             r20              = _mm_mul_pd(rsq20,rinv20);
576
577             /* Calculate table index by multiplying r with table scale and truncate to integer */
578             rt               = _mm_mul_pd(r20,vftabscale);
579             vfitab           = _mm_cvttpd_epi32(rt);
580 #ifdef __XOP__
581             vfeps            = _mm_frcz_pd(rt);
582 #else
583             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
584 #endif
585             twovfeps         = _mm_add_pd(vfeps,vfeps);
586             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
587
588             /* CUBIC SPLINE TABLE ELECTROSTATICS */
589             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
590             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
591             GMX_MM_TRANSPOSE2_PD(Y,F);
592             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
593             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
594             GMX_MM_TRANSPOSE2_PD(G,H);
595             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
596             VV               = _mm_macc_pd(vfeps,Fp,Y);
597             velec            = _mm_mul_pd(qq20,VV);
598             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
599             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
600
601             /* Update potential sum for this i atom from the interaction with this j atom. */
602             velecsum         = _mm_add_pd(velecsum,velec);
603
604             fscal            = felec;
605
606             /* Update vectorial force */
607             fix2             = _mm_macc_pd(dx20,fscal,fix2);
608             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
609             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
610             
611             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
612             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
613             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
614
615             /**************************
616              * CALCULATE INTERACTIONS *
617              **************************/
618
619             r21              = _mm_mul_pd(rsq21,rinv21);
620
621             /* Calculate table index by multiplying r with table scale and truncate to integer */
622             rt               = _mm_mul_pd(r21,vftabscale);
623             vfitab           = _mm_cvttpd_epi32(rt);
624 #ifdef __XOP__
625             vfeps            = _mm_frcz_pd(rt);
626 #else
627             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
628 #endif
629             twovfeps         = _mm_add_pd(vfeps,vfeps);
630             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
631
632             /* CUBIC SPLINE TABLE ELECTROSTATICS */
633             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
634             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
635             GMX_MM_TRANSPOSE2_PD(Y,F);
636             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
637             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
638             GMX_MM_TRANSPOSE2_PD(G,H);
639             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
640             VV               = _mm_macc_pd(vfeps,Fp,Y);
641             velec            = _mm_mul_pd(qq21,VV);
642             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
643             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
644
645             /* Update potential sum for this i atom from the interaction with this j atom. */
646             velecsum         = _mm_add_pd(velecsum,velec);
647
648             fscal            = felec;
649
650             /* Update vectorial force */
651             fix2             = _mm_macc_pd(dx21,fscal,fix2);
652             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
653             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
654             
655             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
656             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
657             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
658
659             /**************************
660              * CALCULATE INTERACTIONS *
661              **************************/
662
663             r22              = _mm_mul_pd(rsq22,rinv22);
664
665             /* Calculate table index by multiplying r with table scale and truncate to integer */
666             rt               = _mm_mul_pd(r22,vftabscale);
667             vfitab           = _mm_cvttpd_epi32(rt);
668 #ifdef __XOP__
669             vfeps            = _mm_frcz_pd(rt);
670 #else
671             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
672 #endif
673             twovfeps         = _mm_add_pd(vfeps,vfeps);
674             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
675
676             /* CUBIC SPLINE TABLE ELECTROSTATICS */
677             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
678             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
679             GMX_MM_TRANSPOSE2_PD(Y,F);
680             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
681             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
682             GMX_MM_TRANSPOSE2_PD(G,H);
683             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
684             VV               = _mm_macc_pd(vfeps,Fp,Y);
685             velec            = _mm_mul_pd(qq22,VV);
686             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
687             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
688
689             /* Update potential sum for this i atom from the interaction with this j atom. */
690             velecsum         = _mm_add_pd(velecsum,velec);
691
692             fscal            = felec;
693
694             /* Update vectorial force */
695             fix2             = _mm_macc_pd(dx22,fscal,fix2);
696             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
697             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
698             
699             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
700             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
701             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
702
703             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
704
705             /* Inner loop uses 444 flops */
706         }
707
708         if(jidx<j_index_end)
709         {
710
711             jnrA             = jjnr[jidx];
712             j_coord_offsetA  = DIM*jnrA;
713
714             /* load j atom coordinates */
715             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
716                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
717
718             /* Calculate displacement vector */
719             dx00             = _mm_sub_pd(ix0,jx0);
720             dy00             = _mm_sub_pd(iy0,jy0);
721             dz00             = _mm_sub_pd(iz0,jz0);
722             dx01             = _mm_sub_pd(ix0,jx1);
723             dy01             = _mm_sub_pd(iy0,jy1);
724             dz01             = _mm_sub_pd(iz0,jz1);
725             dx02             = _mm_sub_pd(ix0,jx2);
726             dy02             = _mm_sub_pd(iy0,jy2);
727             dz02             = _mm_sub_pd(iz0,jz2);
728             dx10             = _mm_sub_pd(ix1,jx0);
729             dy10             = _mm_sub_pd(iy1,jy0);
730             dz10             = _mm_sub_pd(iz1,jz0);
731             dx11             = _mm_sub_pd(ix1,jx1);
732             dy11             = _mm_sub_pd(iy1,jy1);
733             dz11             = _mm_sub_pd(iz1,jz1);
734             dx12             = _mm_sub_pd(ix1,jx2);
735             dy12             = _mm_sub_pd(iy1,jy2);
736             dz12             = _mm_sub_pd(iz1,jz2);
737             dx20             = _mm_sub_pd(ix2,jx0);
738             dy20             = _mm_sub_pd(iy2,jy0);
739             dz20             = _mm_sub_pd(iz2,jz0);
740             dx21             = _mm_sub_pd(ix2,jx1);
741             dy21             = _mm_sub_pd(iy2,jy1);
742             dz21             = _mm_sub_pd(iz2,jz1);
743             dx22             = _mm_sub_pd(ix2,jx2);
744             dy22             = _mm_sub_pd(iy2,jy2);
745             dz22             = _mm_sub_pd(iz2,jz2);
746
747             /* Calculate squared distance and things based on it */
748             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
749             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
750             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
751             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
752             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
753             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
754             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
755             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
756             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
757
758             rinv00           = gmx_mm_invsqrt_pd(rsq00);
759             rinv01           = gmx_mm_invsqrt_pd(rsq01);
760             rinv02           = gmx_mm_invsqrt_pd(rsq02);
761             rinv10           = gmx_mm_invsqrt_pd(rsq10);
762             rinv11           = gmx_mm_invsqrt_pd(rsq11);
763             rinv12           = gmx_mm_invsqrt_pd(rsq12);
764             rinv20           = gmx_mm_invsqrt_pd(rsq20);
765             rinv21           = gmx_mm_invsqrt_pd(rsq21);
766             rinv22           = gmx_mm_invsqrt_pd(rsq22);
767
768             fjx0             = _mm_setzero_pd();
769             fjy0             = _mm_setzero_pd();
770             fjz0             = _mm_setzero_pd();
771             fjx1             = _mm_setzero_pd();
772             fjy1             = _mm_setzero_pd();
773             fjz1             = _mm_setzero_pd();
774             fjx2             = _mm_setzero_pd();
775             fjy2             = _mm_setzero_pd();
776             fjz2             = _mm_setzero_pd();
777
778             /**************************
779              * CALCULATE INTERACTIONS *
780              **************************/
781
782             r00              = _mm_mul_pd(rsq00,rinv00);
783
784             /* Calculate table index by multiplying r with table scale and truncate to integer */
785             rt               = _mm_mul_pd(r00,vftabscale);
786             vfitab           = _mm_cvttpd_epi32(rt);
787 #ifdef __XOP__
788             vfeps            = _mm_frcz_pd(rt);
789 #else
790             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
791 #endif
792             twovfeps         = _mm_add_pd(vfeps,vfeps);
793             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
794
795             /* CUBIC SPLINE TABLE ELECTROSTATICS */
796             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
797             F                = _mm_setzero_pd();
798             GMX_MM_TRANSPOSE2_PD(Y,F);
799             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
800             H                = _mm_setzero_pd();
801             GMX_MM_TRANSPOSE2_PD(G,H);
802             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
803             VV               = _mm_macc_pd(vfeps,Fp,Y);
804             velec            = _mm_mul_pd(qq00,VV);
805             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
806             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
807
808             /* CUBIC SPLINE TABLE DISPERSION */
809             vfitab           = _mm_add_epi32(vfitab,ifour);
810             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
811             F                = _mm_setzero_pd();
812             GMX_MM_TRANSPOSE2_PD(Y,F);
813             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
814             H                = _mm_setzero_pd();
815             GMX_MM_TRANSPOSE2_PD(G,H);
816             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
817             VV               = _mm_macc_pd(vfeps,Fp,Y);
818             vvdw6            = _mm_mul_pd(c6_00,VV);
819             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
820             fvdw6            = _mm_mul_pd(c6_00,FF);
821
822             /* CUBIC SPLINE TABLE REPULSION */
823             vfitab           = _mm_add_epi32(vfitab,ifour);
824             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
825             F                = _mm_setzero_pd();
826             GMX_MM_TRANSPOSE2_PD(Y,F);
827             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
828             H                = _mm_setzero_pd();
829             GMX_MM_TRANSPOSE2_PD(G,H);
830             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
831             VV               = _mm_macc_pd(vfeps,Fp,Y);
832             vvdw12           = _mm_mul_pd(c12_00,VV);
833             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
834             fvdw12           = _mm_mul_pd(c12_00,FF);
835             vvdw             = _mm_add_pd(vvdw12,vvdw6);
836             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
837
838             /* Update potential sum for this i atom from the interaction with this j atom. */
839             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
840             velecsum         = _mm_add_pd(velecsum,velec);
841             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
842             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
843
844             fscal            = _mm_add_pd(felec,fvdw);
845
846             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
847
848             /* Update vectorial force */
849             fix0             = _mm_macc_pd(dx00,fscal,fix0);
850             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
851             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
852             
853             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
854             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
855             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
856
857             /**************************
858              * CALCULATE INTERACTIONS *
859              **************************/
860
861             r01              = _mm_mul_pd(rsq01,rinv01);
862
863             /* Calculate table index by multiplying r with table scale and truncate to integer */
864             rt               = _mm_mul_pd(r01,vftabscale);
865             vfitab           = _mm_cvttpd_epi32(rt);
866 #ifdef __XOP__
867             vfeps            = _mm_frcz_pd(rt);
868 #else
869             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
870 #endif
871             twovfeps         = _mm_add_pd(vfeps,vfeps);
872             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
873
874             /* CUBIC SPLINE TABLE ELECTROSTATICS */
875             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
876             F                = _mm_setzero_pd();
877             GMX_MM_TRANSPOSE2_PD(Y,F);
878             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
879             H                = _mm_setzero_pd();
880             GMX_MM_TRANSPOSE2_PD(G,H);
881             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
882             VV               = _mm_macc_pd(vfeps,Fp,Y);
883             velec            = _mm_mul_pd(qq01,VV);
884             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
885             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
886
887             /* Update potential sum for this i atom from the interaction with this j atom. */
888             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
889             velecsum         = _mm_add_pd(velecsum,velec);
890
891             fscal            = felec;
892
893             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
894
895             /* Update vectorial force */
896             fix0             = _mm_macc_pd(dx01,fscal,fix0);
897             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
898             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
899             
900             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
901             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
902             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
903
904             /**************************
905              * CALCULATE INTERACTIONS *
906              **************************/
907
908             r02              = _mm_mul_pd(rsq02,rinv02);
909
910             /* Calculate table index by multiplying r with table scale and truncate to integer */
911             rt               = _mm_mul_pd(r02,vftabscale);
912             vfitab           = _mm_cvttpd_epi32(rt);
913 #ifdef __XOP__
914             vfeps            = _mm_frcz_pd(rt);
915 #else
916             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
917 #endif
918             twovfeps         = _mm_add_pd(vfeps,vfeps);
919             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
920
921             /* CUBIC SPLINE TABLE ELECTROSTATICS */
922             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
923             F                = _mm_setzero_pd();
924             GMX_MM_TRANSPOSE2_PD(Y,F);
925             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
926             H                = _mm_setzero_pd();
927             GMX_MM_TRANSPOSE2_PD(G,H);
928             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
929             VV               = _mm_macc_pd(vfeps,Fp,Y);
930             velec            = _mm_mul_pd(qq02,VV);
931             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
932             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
933
934             /* Update potential sum for this i atom from the interaction with this j atom. */
935             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
936             velecsum         = _mm_add_pd(velecsum,velec);
937
938             fscal            = felec;
939
940             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
941
942             /* Update vectorial force */
943             fix0             = _mm_macc_pd(dx02,fscal,fix0);
944             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
945             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
946             
947             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
948             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
949             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
950
951             /**************************
952              * CALCULATE INTERACTIONS *
953              **************************/
954
955             r10              = _mm_mul_pd(rsq10,rinv10);
956
957             /* Calculate table index by multiplying r with table scale and truncate to integer */
958             rt               = _mm_mul_pd(r10,vftabscale);
959             vfitab           = _mm_cvttpd_epi32(rt);
960 #ifdef __XOP__
961             vfeps            = _mm_frcz_pd(rt);
962 #else
963             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
964 #endif
965             twovfeps         = _mm_add_pd(vfeps,vfeps);
966             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
967
968             /* CUBIC SPLINE TABLE ELECTROSTATICS */
969             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
970             F                = _mm_setzero_pd();
971             GMX_MM_TRANSPOSE2_PD(Y,F);
972             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
973             H                = _mm_setzero_pd();
974             GMX_MM_TRANSPOSE2_PD(G,H);
975             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
976             VV               = _mm_macc_pd(vfeps,Fp,Y);
977             velec            = _mm_mul_pd(qq10,VV);
978             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
979             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
980
981             /* Update potential sum for this i atom from the interaction with this j atom. */
982             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
983             velecsum         = _mm_add_pd(velecsum,velec);
984
985             fscal            = felec;
986
987             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
988
989             /* Update vectorial force */
990             fix1             = _mm_macc_pd(dx10,fscal,fix1);
991             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
992             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
993             
994             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
995             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
996             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
997
998             /**************************
999              * CALCULATE INTERACTIONS *
1000              **************************/
1001
1002             r11              = _mm_mul_pd(rsq11,rinv11);
1003
1004             /* Calculate table index by multiplying r with table scale and truncate to integer */
1005             rt               = _mm_mul_pd(r11,vftabscale);
1006             vfitab           = _mm_cvttpd_epi32(rt);
1007 #ifdef __XOP__
1008             vfeps            = _mm_frcz_pd(rt);
1009 #else
1010             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1011 #endif
1012             twovfeps         = _mm_add_pd(vfeps,vfeps);
1013             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1014
1015             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1016             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1017             F                = _mm_setzero_pd();
1018             GMX_MM_TRANSPOSE2_PD(Y,F);
1019             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1020             H                = _mm_setzero_pd();
1021             GMX_MM_TRANSPOSE2_PD(G,H);
1022             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1023             VV               = _mm_macc_pd(vfeps,Fp,Y);
1024             velec            = _mm_mul_pd(qq11,VV);
1025             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1026             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1027
1028             /* Update potential sum for this i atom from the interaction with this j atom. */
1029             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1030             velecsum         = _mm_add_pd(velecsum,velec);
1031
1032             fscal            = felec;
1033
1034             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1035
1036             /* Update vectorial force */
1037             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1038             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1039             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1040             
1041             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1042             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1043             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1044
1045             /**************************
1046              * CALCULATE INTERACTIONS *
1047              **************************/
1048
1049             r12              = _mm_mul_pd(rsq12,rinv12);
1050
1051             /* Calculate table index by multiplying r with table scale and truncate to integer */
1052             rt               = _mm_mul_pd(r12,vftabscale);
1053             vfitab           = _mm_cvttpd_epi32(rt);
1054 #ifdef __XOP__
1055             vfeps            = _mm_frcz_pd(rt);
1056 #else
1057             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1058 #endif
1059             twovfeps         = _mm_add_pd(vfeps,vfeps);
1060             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1061
1062             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1063             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1064             F                = _mm_setzero_pd();
1065             GMX_MM_TRANSPOSE2_PD(Y,F);
1066             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1067             H                = _mm_setzero_pd();
1068             GMX_MM_TRANSPOSE2_PD(G,H);
1069             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1070             VV               = _mm_macc_pd(vfeps,Fp,Y);
1071             velec            = _mm_mul_pd(qq12,VV);
1072             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1073             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1074
1075             /* Update potential sum for this i atom from the interaction with this j atom. */
1076             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1077             velecsum         = _mm_add_pd(velecsum,velec);
1078
1079             fscal            = felec;
1080
1081             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1082
1083             /* Update vectorial force */
1084             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1085             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1086             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1087             
1088             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1089             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1090             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1091
1092             /**************************
1093              * CALCULATE INTERACTIONS *
1094              **************************/
1095
1096             r20              = _mm_mul_pd(rsq20,rinv20);
1097
1098             /* Calculate table index by multiplying r with table scale and truncate to integer */
1099             rt               = _mm_mul_pd(r20,vftabscale);
1100             vfitab           = _mm_cvttpd_epi32(rt);
1101 #ifdef __XOP__
1102             vfeps            = _mm_frcz_pd(rt);
1103 #else
1104             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1105 #endif
1106             twovfeps         = _mm_add_pd(vfeps,vfeps);
1107             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1108
1109             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1110             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1111             F                = _mm_setzero_pd();
1112             GMX_MM_TRANSPOSE2_PD(Y,F);
1113             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1114             H                = _mm_setzero_pd();
1115             GMX_MM_TRANSPOSE2_PD(G,H);
1116             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1117             VV               = _mm_macc_pd(vfeps,Fp,Y);
1118             velec            = _mm_mul_pd(qq20,VV);
1119             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1120             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1121
1122             /* Update potential sum for this i atom from the interaction with this j atom. */
1123             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1124             velecsum         = _mm_add_pd(velecsum,velec);
1125
1126             fscal            = felec;
1127
1128             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1129
1130             /* Update vectorial force */
1131             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1132             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1133             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1134             
1135             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1136             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1137             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1138
1139             /**************************
1140              * CALCULATE INTERACTIONS *
1141              **************************/
1142
1143             r21              = _mm_mul_pd(rsq21,rinv21);
1144
1145             /* Calculate table index by multiplying r with table scale and truncate to integer */
1146             rt               = _mm_mul_pd(r21,vftabscale);
1147             vfitab           = _mm_cvttpd_epi32(rt);
1148 #ifdef __XOP__
1149             vfeps            = _mm_frcz_pd(rt);
1150 #else
1151             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1152 #endif
1153             twovfeps         = _mm_add_pd(vfeps,vfeps);
1154             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1155
1156             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1157             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1158             F                = _mm_setzero_pd();
1159             GMX_MM_TRANSPOSE2_PD(Y,F);
1160             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1161             H                = _mm_setzero_pd();
1162             GMX_MM_TRANSPOSE2_PD(G,H);
1163             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1164             VV               = _mm_macc_pd(vfeps,Fp,Y);
1165             velec            = _mm_mul_pd(qq21,VV);
1166             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1167             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1168
1169             /* Update potential sum for this i atom from the interaction with this j atom. */
1170             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1171             velecsum         = _mm_add_pd(velecsum,velec);
1172
1173             fscal            = felec;
1174
1175             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1176
1177             /* Update vectorial force */
1178             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1179             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1180             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1181             
1182             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1183             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1184             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1185
1186             /**************************
1187              * CALCULATE INTERACTIONS *
1188              **************************/
1189
1190             r22              = _mm_mul_pd(rsq22,rinv22);
1191
1192             /* Calculate table index by multiplying r with table scale and truncate to integer */
1193             rt               = _mm_mul_pd(r22,vftabscale);
1194             vfitab           = _mm_cvttpd_epi32(rt);
1195 #ifdef __XOP__
1196             vfeps            = _mm_frcz_pd(rt);
1197 #else
1198             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1199 #endif
1200             twovfeps         = _mm_add_pd(vfeps,vfeps);
1201             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1202
1203             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1204             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1205             F                = _mm_setzero_pd();
1206             GMX_MM_TRANSPOSE2_PD(Y,F);
1207             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1208             H                = _mm_setzero_pd();
1209             GMX_MM_TRANSPOSE2_PD(G,H);
1210             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1211             VV               = _mm_macc_pd(vfeps,Fp,Y);
1212             velec            = _mm_mul_pd(qq22,VV);
1213             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1214             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1215
1216             /* Update potential sum for this i atom from the interaction with this j atom. */
1217             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1218             velecsum         = _mm_add_pd(velecsum,velec);
1219
1220             fscal            = felec;
1221
1222             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1223
1224             /* Update vectorial force */
1225             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1226             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1227             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1228             
1229             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1230             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1231             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1232
1233             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1234
1235             /* Inner loop uses 444 flops */
1236         }
1237
1238         /* End of innermost loop */
1239
1240         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1241                                               f+i_coord_offset,fshift+i_shift_offset);
1242
1243         ggid                        = gid[iidx];
1244         /* Update potential energies */
1245         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1246         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1247
1248         /* Increment number of inner iterations */
1249         inneriter                  += j_index_end - j_index_start;
1250
1251         /* Outer loop uses 20 flops */
1252     }
1253
1254     /* Increment number of outer iterations */
1255     outeriter        += nri;
1256
1257     /* Update outer/inner flops */
1258
1259     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*444);
1260 }
1261 /*
1262  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1263  * Electrostatics interaction: CubicSplineTable
1264  * VdW interaction:            CubicSplineTable
1265  * Geometry:                   Water3-Water3
1266  * Calculate force/pot:        Force
1267  */
1268 void
1269 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_128_fma_double
1270                     (t_nblist                    * gmx_restrict       nlist,
1271                      rvec                        * gmx_restrict          xx,
1272                      rvec                        * gmx_restrict          ff,
1273                      t_forcerec                  * gmx_restrict          fr,
1274                      t_mdatoms                   * gmx_restrict     mdatoms,
1275                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1276                      t_nrnb                      * gmx_restrict        nrnb)
1277 {
1278     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1279      * just 0 for non-waters.
1280      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1281      * jnr indices corresponding to data put in the four positions in the SIMD register.
1282      */
1283     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1284     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1285     int              jnrA,jnrB;
1286     int              j_coord_offsetA,j_coord_offsetB;
1287     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1288     real             rcutoff_scalar;
1289     real             *shiftvec,*fshift,*x,*f;
1290     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1291     int              vdwioffset0;
1292     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1293     int              vdwioffset1;
1294     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1295     int              vdwioffset2;
1296     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1297     int              vdwjidx0A,vdwjidx0B;
1298     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1299     int              vdwjidx1A,vdwjidx1B;
1300     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1301     int              vdwjidx2A,vdwjidx2B;
1302     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1303     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1304     __m128d          dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1305     __m128d          dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1306     __m128d          dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1307     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1308     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1309     __m128d          dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1310     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1311     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1312     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1313     real             *charge;
1314     int              nvdwtype;
1315     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1316     int              *vdwtype;
1317     real             *vdwparam;
1318     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1319     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1320     __m128i          vfitab;
1321     __m128i          ifour       = _mm_set1_epi32(4);
1322     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1323     real             *vftab;
1324     __m128d          dummy_mask,cutoff_mask;
1325     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1326     __m128d          one     = _mm_set1_pd(1.0);
1327     __m128d          two     = _mm_set1_pd(2.0);
1328     x                = xx[0];
1329     f                = ff[0];
1330
1331     nri              = nlist->nri;
1332     iinr             = nlist->iinr;
1333     jindex           = nlist->jindex;
1334     jjnr             = nlist->jjnr;
1335     shiftidx         = nlist->shift;
1336     gid              = nlist->gid;
1337     shiftvec         = fr->shift_vec[0];
1338     fshift           = fr->fshift[0];
1339     facel            = _mm_set1_pd(fr->epsfac);
1340     charge           = mdatoms->chargeA;
1341     nvdwtype         = fr->ntype;
1342     vdwparam         = fr->nbfp;
1343     vdwtype          = mdatoms->typeA;
1344
1345     vftab            = kernel_data->table_elec_vdw->data;
1346     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1347
1348     /* Setup water-specific parameters */
1349     inr              = nlist->iinr[0];
1350     iq0              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+0]));
1351     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1352     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1353     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1354
1355     jq0              = _mm_set1_pd(charge[inr+0]);
1356     jq1              = _mm_set1_pd(charge[inr+1]);
1357     jq2              = _mm_set1_pd(charge[inr+2]);
1358     vdwjidx0A        = 2*vdwtype[inr+0];
1359     qq00             = _mm_mul_pd(iq0,jq0);
1360     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1361     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1362     qq01             = _mm_mul_pd(iq0,jq1);
1363     qq02             = _mm_mul_pd(iq0,jq2);
1364     qq10             = _mm_mul_pd(iq1,jq0);
1365     qq11             = _mm_mul_pd(iq1,jq1);
1366     qq12             = _mm_mul_pd(iq1,jq2);
1367     qq20             = _mm_mul_pd(iq2,jq0);
1368     qq21             = _mm_mul_pd(iq2,jq1);
1369     qq22             = _mm_mul_pd(iq2,jq2);
1370
1371     /* Avoid stupid compiler warnings */
1372     jnrA = jnrB = 0;
1373     j_coord_offsetA = 0;
1374     j_coord_offsetB = 0;
1375
1376     outeriter        = 0;
1377     inneriter        = 0;
1378
1379     /* Start outer loop over neighborlists */
1380     for(iidx=0; iidx<nri; iidx++)
1381     {
1382         /* Load shift vector for this list */
1383         i_shift_offset   = DIM*shiftidx[iidx];
1384
1385         /* Load limits for loop over neighbors */
1386         j_index_start    = jindex[iidx];
1387         j_index_end      = jindex[iidx+1];
1388
1389         /* Get outer coordinate index */
1390         inr              = iinr[iidx];
1391         i_coord_offset   = DIM*inr;
1392
1393         /* Load i particle coords and add shift vector */
1394         gmx_mm_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1395                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1396
1397         fix0             = _mm_setzero_pd();
1398         fiy0             = _mm_setzero_pd();
1399         fiz0             = _mm_setzero_pd();
1400         fix1             = _mm_setzero_pd();
1401         fiy1             = _mm_setzero_pd();
1402         fiz1             = _mm_setzero_pd();
1403         fix2             = _mm_setzero_pd();
1404         fiy2             = _mm_setzero_pd();
1405         fiz2             = _mm_setzero_pd();
1406
1407         /* Start inner kernel loop */
1408         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1409         {
1410
1411             /* Get j neighbor index, and coordinate index */
1412             jnrA             = jjnr[jidx];
1413             jnrB             = jjnr[jidx+1];
1414             j_coord_offsetA  = DIM*jnrA;
1415             j_coord_offsetB  = DIM*jnrB;
1416
1417             /* load j atom coordinates */
1418             gmx_mm_load_3rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1419                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1420
1421             /* Calculate displacement vector */
1422             dx00             = _mm_sub_pd(ix0,jx0);
1423             dy00             = _mm_sub_pd(iy0,jy0);
1424             dz00             = _mm_sub_pd(iz0,jz0);
1425             dx01             = _mm_sub_pd(ix0,jx1);
1426             dy01             = _mm_sub_pd(iy0,jy1);
1427             dz01             = _mm_sub_pd(iz0,jz1);
1428             dx02             = _mm_sub_pd(ix0,jx2);
1429             dy02             = _mm_sub_pd(iy0,jy2);
1430             dz02             = _mm_sub_pd(iz0,jz2);
1431             dx10             = _mm_sub_pd(ix1,jx0);
1432             dy10             = _mm_sub_pd(iy1,jy0);
1433             dz10             = _mm_sub_pd(iz1,jz0);
1434             dx11             = _mm_sub_pd(ix1,jx1);
1435             dy11             = _mm_sub_pd(iy1,jy1);
1436             dz11             = _mm_sub_pd(iz1,jz1);
1437             dx12             = _mm_sub_pd(ix1,jx2);
1438             dy12             = _mm_sub_pd(iy1,jy2);
1439             dz12             = _mm_sub_pd(iz1,jz2);
1440             dx20             = _mm_sub_pd(ix2,jx0);
1441             dy20             = _mm_sub_pd(iy2,jy0);
1442             dz20             = _mm_sub_pd(iz2,jz0);
1443             dx21             = _mm_sub_pd(ix2,jx1);
1444             dy21             = _mm_sub_pd(iy2,jy1);
1445             dz21             = _mm_sub_pd(iz2,jz1);
1446             dx22             = _mm_sub_pd(ix2,jx2);
1447             dy22             = _mm_sub_pd(iy2,jy2);
1448             dz22             = _mm_sub_pd(iz2,jz2);
1449
1450             /* Calculate squared distance and things based on it */
1451             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1452             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1453             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1454             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1455             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1456             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1457             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1458             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1459             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1460
1461             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1462             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1463             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1464             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1465             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1466             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1467             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1468             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1469             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1470
1471             fjx0             = _mm_setzero_pd();
1472             fjy0             = _mm_setzero_pd();
1473             fjz0             = _mm_setzero_pd();
1474             fjx1             = _mm_setzero_pd();
1475             fjy1             = _mm_setzero_pd();
1476             fjz1             = _mm_setzero_pd();
1477             fjx2             = _mm_setzero_pd();
1478             fjy2             = _mm_setzero_pd();
1479             fjz2             = _mm_setzero_pd();
1480
1481             /**************************
1482              * CALCULATE INTERACTIONS *
1483              **************************/
1484
1485             r00              = _mm_mul_pd(rsq00,rinv00);
1486
1487             /* Calculate table index by multiplying r with table scale and truncate to integer */
1488             rt               = _mm_mul_pd(r00,vftabscale);
1489             vfitab           = _mm_cvttpd_epi32(rt);
1490 #ifdef __XOP__
1491             vfeps            = _mm_frcz_pd(rt);
1492 #else
1493             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1494 #endif
1495             twovfeps         = _mm_add_pd(vfeps,vfeps);
1496             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1497
1498             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1499             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1500             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1501             GMX_MM_TRANSPOSE2_PD(Y,F);
1502             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1503             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1504             GMX_MM_TRANSPOSE2_PD(G,H);
1505             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1506             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1507             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1508
1509             /* CUBIC SPLINE TABLE DISPERSION */
1510             vfitab           = _mm_add_epi32(vfitab,ifour);
1511             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1512             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1513             GMX_MM_TRANSPOSE2_PD(Y,F);
1514             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1515             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1516             GMX_MM_TRANSPOSE2_PD(G,H);
1517             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1518             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1519             fvdw6            = _mm_mul_pd(c6_00,FF);
1520
1521             /* CUBIC SPLINE TABLE REPULSION */
1522             vfitab           = _mm_add_epi32(vfitab,ifour);
1523             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1524             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1525             GMX_MM_TRANSPOSE2_PD(Y,F);
1526             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1527             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1528             GMX_MM_TRANSPOSE2_PD(G,H);
1529             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1530             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1531             fvdw12           = _mm_mul_pd(c12_00,FF);
1532             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1533
1534             fscal            = _mm_add_pd(felec,fvdw);
1535
1536             /* Update vectorial force */
1537             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1538             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1539             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1540             
1541             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1542             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1543             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1544
1545             /**************************
1546              * CALCULATE INTERACTIONS *
1547              **************************/
1548
1549             r01              = _mm_mul_pd(rsq01,rinv01);
1550
1551             /* Calculate table index by multiplying r with table scale and truncate to integer */
1552             rt               = _mm_mul_pd(r01,vftabscale);
1553             vfitab           = _mm_cvttpd_epi32(rt);
1554 #ifdef __XOP__
1555             vfeps            = _mm_frcz_pd(rt);
1556 #else
1557             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1558 #endif
1559             twovfeps         = _mm_add_pd(vfeps,vfeps);
1560             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1561
1562             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1563             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1564             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1565             GMX_MM_TRANSPOSE2_PD(Y,F);
1566             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1567             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1568             GMX_MM_TRANSPOSE2_PD(G,H);
1569             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1570             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1571             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
1572
1573             fscal            = felec;
1574
1575             /* Update vectorial force */
1576             fix0             = _mm_macc_pd(dx01,fscal,fix0);
1577             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
1578             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
1579             
1580             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
1581             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
1582             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
1583
1584             /**************************
1585              * CALCULATE INTERACTIONS *
1586              **************************/
1587
1588             r02              = _mm_mul_pd(rsq02,rinv02);
1589
1590             /* Calculate table index by multiplying r with table scale and truncate to integer */
1591             rt               = _mm_mul_pd(r02,vftabscale);
1592             vfitab           = _mm_cvttpd_epi32(rt);
1593 #ifdef __XOP__
1594             vfeps            = _mm_frcz_pd(rt);
1595 #else
1596             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1597 #endif
1598             twovfeps         = _mm_add_pd(vfeps,vfeps);
1599             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1600
1601             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1602             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1603             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1604             GMX_MM_TRANSPOSE2_PD(Y,F);
1605             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1606             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1607             GMX_MM_TRANSPOSE2_PD(G,H);
1608             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1609             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1610             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
1611
1612             fscal            = felec;
1613
1614             /* Update vectorial force */
1615             fix0             = _mm_macc_pd(dx02,fscal,fix0);
1616             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
1617             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
1618             
1619             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
1620             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
1621             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
1622
1623             /**************************
1624              * CALCULATE INTERACTIONS *
1625              **************************/
1626
1627             r10              = _mm_mul_pd(rsq10,rinv10);
1628
1629             /* Calculate table index by multiplying r with table scale and truncate to integer */
1630             rt               = _mm_mul_pd(r10,vftabscale);
1631             vfitab           = _mm_cvttpd_epi32(rt);
1632 #ifdef __XOP__
1633             vfeps            = _mm_frcz_pd(rt);
1634 #else
1635             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1636 #endif
1637             twovfeps         = _mm_add_pd(vfeps,vfeps);
1638             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1639
1640             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1641             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1642             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1643             GMX_MM_TRANSPOSE2_PD(Y,F);
1644             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1645             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1646             GMX_MM_TRANSPOSE2_PD(G,H);
1647             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1648             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1649             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
1650
1651             fscal            = felec;
1652
1653             /* Update vectorial force */
1654             fix1             = _mm_macc_pd(dx10,fscal,fix1);
1655             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
1656             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
1657             
1658             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
1659             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
1660             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
1661
1662             /**************************
1663              * CALCULATE INTERACTIONS *
1664              **************************/
1665
1666             r11              = _mm_mul_pd(rsq11,rinv11);
1667
1668             /* Calculate table index by multiplying r with table scale and truncate to integer */
1669             rt               = _mm_mul_pd(r11,vftabscale);
1670             vfitab           = _mm_cvttpd_epi32(rt);
1671 #ifdef __XOP__
1672             vfeps            = _mm_frcz_pd(rt);
1673 #else
1674             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1675 #endif
1676             twovfeps         = _mm_add_pd(vfeps,vfeps);
1677             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1678
1679             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1680             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1681             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1682             GMX_MM_TRANSPOSE2_PD(Y,F);
1683             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1684             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1685             GMX_MM_TRANSPOSE2_PD(G,H);
1686             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1687             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1688             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1689
1690             fscal            = felec;
1691
1692             /* Update vectorial force */
1693             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1694             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1695             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1696             
1697             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1698             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1699             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1700
1701             /**************************
1702              * CALCULATE INTERACTIONS *
1703              **************************/
1704
1705             r12              = _mm_mul_pd(rsq12,rinv12);
1706
1707             /* Calculate table index by multiplying r with table scale and truncate to integer */
1708             rt               = _mm_mul_pd(r12,vftabscale);
1709             vfitab           = _mm_cvttpd_epi32(rt);
1710 #ifdef __XOP__
1711             vfeps            = _mm_frcz_pd(rt);
1712 #else
1713             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1714 #endif
1715             twovfeps         = _mm_add_pd(vfeps,vfeps);
1716             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1717
1718             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1719             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1720             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1721             GMX_MM_TRANSPOSE2_PD(Y,F);
1722             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1723             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1724             GMX_MM_TRANSPOSE2_PD(G,H);
1725             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1726             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1727             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1728
1729             fscal            = felec;
1730
1731             /* Update vectorial force */
1732             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1733             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1734             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1735             
1736             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1737             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1738             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1739
1740             /**************************
1741              * CALCULATE INTERACTIONS *
1742              **************************/
1743
1744             r20              = _mm_mul_pd(rsq20,rinv20);
1745
1746             /* Calculate table index by multiplying r with table scale and truncate to integer */
1747             rt               = _mm_mul_pd(r20,vftabscale);
1748             vfitab           = _mm_cvttpd_epi32(rt);
1749 #ifdef __XOP__
1750             vfeps            = _mm_frcz_pd(rt);
1751 #else
1752             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1753 #endif
1754             twovfeps         = _mm_add_pd(vfeps,vfeps);
1755             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1756
1757             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1758             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1759             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1760             GMX_MM_TRANSPOSE2_PD(Y,F);
1761             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1762             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1763             GMX_MM_TRANSPOSE2_PD(G,H);
1764             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1765             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1766             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
1767
1768             fscal            = felec;
1769
1770             /* Update vectorial force */
1771             fix2             = _mm_macc_pd(dx20,fscal,fix2);
1772             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
1773             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
1774             
1775             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
1776             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
1777             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
1778
1779             /**************************
1780              * CALCULATE INTERACTIONS *
1781              **************************/
1782
1783             r21              = _mm_mul_pd(rsq21,rinv21);
1784
1785             /* Calculate table index by multiplying r with table scale and truncate to integer */
1786             rt               = _mm_mul_pd(r21,vftabscale);
1787             vfitab           = _mm_cvttpd_epi32(rt);
1788 #ifdef __XOP__
1789             vfeps            = _mm_frcz_pd(rt);
1790 #else
1791             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1792 #endif
1793             twovfeps         = _mm_add_pd(vfeps,vfeps);
1794             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1795
1796             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1797             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1798             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1799             GMX_MM_TRANSPOSE2_PD(Y,F);
1800             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1801             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1802             GMX_MM_TRANSPOSE2_PD(G,H);
1803             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1804             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1805             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1806
1807             fscal            = felec;
1808
1809             /* Update vectorial force */
1810             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1811             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1812             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1813             
1814             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1815             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1816             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1817
1818             /**************************
1819              * CALCULATE INTERACTIONS *
1820              **************************/
1821
1822             r22              = _mm_mul_pd(rsq22,rinv22);
1823
1824             /* Calculate table index by multiplying r with table scale and truncate to integer */
1825             rt               = _mm_mul_pd(r22,vftabscale);
1826             vfitab           = _mm_cvttpd_epi32(rt);
1827 #ifdef __XOP__
1828             vfeps            = _mm_frcz_pd(rt);
1829 #else
1830             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1831 #endif
1832             twovfeps         = _mm_add_pd(vfeps,vfeps);
1833             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1834
1835             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1836             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1837             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1838             GMX_MM_TRANSPOSE2_PD(Y,F);
1839             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1840             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1841             GMX_MM_TRANSPOSE2_PD(G,H);
1842             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1843             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1844             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1845
1846             fscal            = felec;
1847
1848             /* Update vectorial force */
1849             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1850             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1851             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1852             
1853             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1854             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1855             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1856
1857             gmx_mm_decrement_3rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1858
1859             /* Inner loop uses 400 flops */
1860         }
1861
1862         if(jidx<j_index_end)
1863         {
1864
1865             jnrA             = jjnr[jidx];
1866             j_coord_offsetA  = DIM*jnrA;
1867
1868             /* load j atom coordinates */
1869             gmx_mm_load_3rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1870                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1871
1872             /* Calculate displacement vector */
1873             dx00             = _mm_sub_pd(ix0,jx0);
1874             dy00             = _mm_sub_pd(iy0,jy0);
1875             dz00             = _mm_sub_pd(iz0,jz0);
1876             dx01             = _mm_sub_pd(ix0,jx1);
1877             dy01             = _mm_sub_pd(iy0,jy1);
1878             dz01             = _mm_sub_pd(iz0,jz1);
1879             dx02             = _mm_sub_pd(ix0,jx2);
1880             dy02             = _mm_sub_pd(iy0,jy2);
1881             dz02             = _mm_sub_pd(iz0,jz2);
1882             dx10             = _mm_sub_pd(ix1,jx0);
1883             dy10             = _mm_sub_pd(iy1,jy0);
1884             dz10             = _mm_sub_pd(iz1,jz0);
1885             dx11             = _mm_sub_pd(ix1,jx1);
1886             dy11             = _mm_sub_pd(iy1,jy1);
1887             dz11             = _mm_sub_pd(iz1,jz1);
1888             dx12             = _mm_sub_pd(ix1,jx2);
1889             dy12             = _mm_sub_pd(iy1,jy2);
1890             dz12             = _mm_sub_pd(iz1,jz2);
1891             dx20             = _mm_sub_pd(ix2,jx0);
1892             dy20             = _mm_sub_pd(iy2,jy0);
1893             dz20             = _mm_sub_pd(iz2,jz0);
1894             dx21             = _mm_sub_pd(ix2,jx1);
1895             dy21             = _mm_sub_pd(iy2,jy1);
1896             dz21             = _mm_sub_pd(iz2,jz1);
1897             dx22             = _mm_sub_pd(ix2,jx2);
1898             dy22             = _mm_sub_pd(iy2,jy2);
1899             dz22             = _mm_sub_pd(iz2,jz2);
1900
1901             /* Calculate squared distance and things based on it */
1902             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1903             rsq01            = gmx_mm_calc_rsq_pd(dx01,dy01,dz01);
1904             rsq02            = gmx_mm_calc_rsq_pd(dx02,dy02,dz02);
1905             rsq10            = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1906             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1907             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1908             rsq20            = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1909             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1910             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1911
1912             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1913             rinv01           = gmx_mm_invsqrt_pd(rsq01);
1914             rinv02           = gmx_mm_invsqrt_pd(rsq02);
1915             rinv10           = gmx_mm_invsqrt_pd(rsq10);
1916             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1917             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1918             rinv20           = gmx_mm_invsqrt_pd(rsq20);
1919             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1920             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1921
1922             fjx0             = _mm_setzero_pd();
1923             fjy0             = _mm_setzero_pd();
1924             fjz0             = _mm_setzero_pd();
1925             fjx1             = _mm_setzero_pd();
1926             fjy1             = _mm_setzero_pd();
1927             fjz1             = _mm_setzero_pd();
1928             fjx2             = _mm_setzero_pd();
1929             fjy2             = _mm_setzero_pd();
1930             fjz2             = _mm_setzero_pd();
1931
1932             /**************************
1933              * CALCULATE INTERACTIONS *
1934              **************************/
1935
1936             r00              = _mm_mul_pd(rsq00,rinv00);
1937
1938             /* Calculate table index by multiplying r with table scale and truncate to integer */
1939             rt               = _mm_mul_pd(r00,vftabscale);
1940             vfitab           = _mm_cvttpd_epi32(rt);
1941 #ifdef __XOP__
1942             vfeps            = _mm_frcz_pd(rt);
1943 #else
1944             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1945 #endif
1946             twovfeps         = _mm_add_pd(vfeps,vfeps);
1947             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1948
1949             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1950             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1951             F                = _mm_setzero_pd();
1952             GMX_MM_TRANSPOSE2_PD(Y,F);
1953             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1954             H                = _mm_setzero_pd();
1955             GMX_MM_TRANSPOSE2_PD(G,H);
1956             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1957             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1958             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq00,FF),_mm_mul_pd(vftabscale,rinv00)));
1959
1960             /* CUBIC SPLINE TABLE DISPERSION */
1961             vfitab           = _mm_add_epi32(vfitab,ifour);
1962             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1963             F                = _mm_setzero_pd();
1964             GMX_MM_TRANSPOSE2_PD(Y,F);
1965             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1966             H                = _mm_setzero_pd();
1967             GMX_MM_TRANSPOSE2_PD(G,H);
1968             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1969             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1970             fvdw6            = _mm_mul_pd(c6_00,FF);
1971
1972             /* CUBIC SPLINE TABLE REPULSION */
1973             vfitab           = _mm_add_epi32(vfitab,ifour);
1974             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1975             F                = _mm_setzero_pd();
1976             GMX_MM_TRANSPOSE2_PD(Y,F);
1977             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1978             H                = _mm_setzero_pd();
1979             GMX_MM_TRANSPOSE2_PD(G,H);
1980             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1981             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1982             fvdw12           = _mm_mul_pd(c12_00,FF);
1983             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1984
1985             fscal            = _mm_add_pd(felec,fvdw);
1986
1987             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1988
1989             /* Update vectorial force */
1990             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1991             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1992             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1993             
1994             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1995             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1996             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1997
1998             /**************************
1999              * CALCULATE INTERACTIONS *
2000              **************************/
2001
2002             r01              = _mm_mul_pd(rsq01,rinv01);
2003
2004             /* Calculate table index by multiplying r with table scale and truncate to integer */
2005             rt               = _mm_mul_pd(r01,vftabscale);
2006             vfitab           = _mm_cvttpd_epi32(rt);
2007 #ifdef __XOP__
2008             vfeps            = _mm_frcz_pd(rt);
2009 #else
2010             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2011 #endif
2012             twovfeps         = _mm_add_pd(vfeps,vfeps);
2013             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2014
2015             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2016             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2017             F                = _mm_setzero_pd();
2018             GMX_MM_TRANSPOSE2_PD(Y,F);
2019             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2020             H                = _mm_setzero_pd();
2021             GMX_MM_TRANSPOSE2_PD(G,H);
2022             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2023             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2024             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq01,FF),_mm_mul_pd(vftabscale,rinv01)));
2025
2026             fscal            = felec;
2027
2028             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2029
2030             /* Update vectorial force */
2031             fix0             = _mm_macc_pd(dx01,fscal,fix0);
2032             fiy0             = _mm_macc_pd(dy01,fscal,fiy0);
2033             fiz0             = _mm_macc_pd(dz01,fscal,fiz0);
2034             
2035             fjx1             = _mm_macc_pd(dx01,fscal,fjx1);
2036             fjy1             = _mm_macc_pd(dy01,fscal,fjy1);
2037             fjz1             = _mm_macc_pd(dz01,fscal,fjz1);
2038
2039             /**************************
2040              * CALCULATE INTERACTIONS *
2041              **************************/
2042
2043             r02              = _mm_mul_pd(rsq02,rinv02);
2044
2045             /* Calculate table index by multiplying r with table scale and truncate to integer */
2046             rt               = _mm_mul_pd(r02,vftabscale);
2047             vfitab           = _mm_cvttpd_epi32(rt);
2048 #ifdef __XOP__
2049             vfeps            = _mm_frcz_pd(rt);
2050 #else
2051             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2052 #endif
2053             twovfeps         = _mm_add_pd(vfeps,vfeps);
2054             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2055
2056             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2057             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2058             F                = _mm_setzero_pd();
2059             GMX_MM_TRANSPOSE2_PD(Y,F);
2060             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2061             H                = _mm_setzero_pd();
2062             GMX_MM_TRANSPOSE2_PD(G,H);
2063             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2064             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2065             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq02,FF),_mm_mul_pd(vftabscale,rinv02)));
2066
2067             fscal            = felec;
2068
2069             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2070
2071             /* Update vectorial force */
2072             fix0             = _mm_macc_pd(dx02,fscal,fix0);
2073             fiy0             = _mm_macc_pd(dy02,fscal,fiy0);
2074             fiz0             = _mm_macc_pd(dz02,fscal,fiz0);
2075             
2076             fjx2             = _mm_macc_pd(dx02,fscal,fjx2);
2077             fjy2             = _mm_macc_pd(dy02,fscal,fjy2);
2078             fjz2             = _mm_macc_pd(dz02,fscal,fjz2);
2079
2080             /**************************
2081              * CALCULATE INTERACTIONS *
2082              **************************/
2083
2084             r10              = _mm_mul_pd(rsq10,rinv10);
2085
2086             /* Calculate table index by multiplying r with table scale and truncate to integer */
2087             rt               = _mm_mul_pd(r10,vftabscale);
2088             vfitab           = _mm_cvttpd_epi32(rt);
2089 #ifdef __XOP__
2090             vfeps            = _mm_frcz_pd(rt);
2091 #else
2092             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2093 #endif
2094             twovfeps         = _mm_add_pd(vfeps,vfeps);
2095             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2096
2097             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2098             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2099             F                = _mm_setzero_pd();
2100             GMX_MM_TRANSPOSE2_PD(Y,F);
2101             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2102             H                = _mm_setzero_pd();
2103             GMX_MM_TRANSPOSE2_PD(G,H);
2104             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2105             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2106             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq10,FF),_mm_mul_pd(vftabscale,rinv10)));
2107
2108             fscal            = felec;
2109
2110             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2111
2112             /* Update vectorial force */
2113             fix1             = _mm_macc_pd(dx10,fscal,fix1);
2114             fiy1             = _mm_macc_pd(dy10,fscal,fiy1);
2115             fiz1             = _mm_macc_pd(dz10,fscal,fiz1);
2116             
2117             fjx0             = _mm_macc_pd(dx10,fscal,fjx0);
2118             fjy0             = _mm_macc_pd(dy10,fscal,fjy0);
2119             fjz0             = _mm_macc_pd(dz10,fscal,fjz0);
2120
2121             /**************************
2122              * CALCULATE INTERACTIONS *
2123              **************************/
2124
2125             r11              = _mm_mul_pd(rsq11,rinv11);
2126
2127             /* Calculate table index by multiplying r with table scale and truncate to integer */
2128             rt               = _mm_mul_pd(r11,vftabscale);
2129             vfitab           = _mm_cvttpd_epi32(rt);
2130 #ifdef __XOP__
2131             vfeps            = _mm_frcz_pd(rt);
2132 #else
2133             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2134 #endif
2135             twovfeps         = _mm_add_pd(vfeps,vfeps);
2136             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2137
2138             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2139             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2140             F                = _mm_setzero_pd();
2141             GMX_MM_TRANSPOSE2_PD(Y,F);
2142             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2143             H                = _mm_setzero_pd();
2144             GMX_MM_TRANSPOSE2_PD(G,H);
2145             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2146             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2147             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2148
2149             fscal            = felec;
2150
2151             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2152
2153             /* Update vectorial force */
2154             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2155             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2156             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2157             
2158             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2159             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2160             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2161
2162             /**************************
2163              * CALCULATE INTERACTIONS *
2164              **************************/
2165
2166             r12              = _mm_mul_pd(rsq12,rinv12);
2167
2168             /* Calculate table index by multiplying r with table scale and truncate to integer */
2169             rt               = _mm_mul_pd(r12,vftabscale);
2170             vfitab           = _mm_cvttpd_epi32(rt);
2171 #ifdef __XOP__
2172             vfeps            = _mm_frcz_pd(rt);
2173 #else
2174             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2175 #endif
2176             twovfeps         = _mm_add_pd(vfeps,vfeps);
2177             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2178
2179             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2180             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2181             F                = _mm_setzero_pd();
2182             GMX_MM_TRANSPOSE2_PD(Y,F);
2183             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2184             H                = _mm_setzero_pd();
2185             GMX_MM_TRANSPOSE2_PD(G,H);
2186             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2187             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2188             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2189
2190             fscal            = felec;
2191
2192             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2193
2194             /* Update vectorial force */
2195             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2196             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2197             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2198             
2199             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2200             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2201             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2202
2203             /**************************
2204              * CALCULATE INTERACTIONS *
2205              **************************/
2206
2207             r20              = _mm_mul_pd(rsq20,rinv20);
2208
2209             /* Calculate table index by multiplying r with table scale and truncate to integer */
2210             rt               = _mm_mul_pd(r20,vftabscale);
2211             vfitab           = _mm_cvttpd_epi32(rt);
2212 #ifdef __XOP__
2213             vfeps            = _mm_frcz_pd(rt);
2214 #else
2215             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2216 #endif
2217             twovfeps         = _mm_add_pd(vfeps,vfeps);
2218             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2219
2220             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2221             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2222             F                = _mm_setzero_pd();
2223             GMX_MM_TRANSPOSE2_PD(Y,F);
2224             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2225             H                = _mm_setzero_pd();
2226             GMX_MM_TRANSPOSE2_PD(G,H);
2227             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2228             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2229             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq20,FF),_mm_mul_pd(vftabscale,rinv20)));
2230
2231             fscal            = felec;
2232
2233             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2234
2235             /* Update vectorial force */
2236             fix2             = _mm_macc_pd(dx20,fscal,fix2);
2237             fiy2             = _mm_macc_pd(dy20,fscal,fiy2);
2238             fiz2             = _mm_macc_pd(dz20,fscal,fiz2);
2239             
2240             fjx0             = _mm_macc_pd(dx20,fscal,fjx0);
2241             fjy0             = _mm_macc_pd(dy20,fscal,fjy0);
2242             fjz0             = _mm_macc_pd(dz20,fscal,fjz0);
2243
2244             /**************************
2245              * CALCULATE INTERACTIONS *
2246              **************************/
2247
2248             r21              = _mm_mul_pd(rsq21,rinv21);
2249
2250             /* Calculate table index by multiplying r with table scale and truncate to integer */
2251             rt               = _mm_mul_pd(r21,vftabscale);
2252             vfitab           = _mm_cvttpd_epi32(rt);
2253 #ifdef __XOP__
2254             vfeps            = _mm_frcz_pd(rt);
2255 #else
2256             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2257 #endif
2258             twovfeps         = _mm_add_pd(vfeps,vfeps);
2259             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2260
2261             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2262             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2263             F                = _mm_setzero_pd();
2264             GMX_MM_TRANSPOSE2_PD(Y,F);
2265             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2266             H                = _mm_setzero_pd();
2267             GMX_MM_TRANSPOSE2_PD(G,H);
2268             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2269             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2270             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2271
2272             fscal            = felec;
2273
2274             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2275
2276             /* Update vectorial force */
2277             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2278             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2279             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2280             
2281             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2282             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2283             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2284
2285             /**************************
2286              * CALCULATE INTERACTIONS *
2287              **************************/
2288
2289             r22              = _mm_mul_pd(rsq22,rinv22);
2290
2291             /* Calculate table index by multiplying r with table scale and truncate to integer */
2292             rt               = _mm_mul_pd(r22,vftabscale);
2293             vfitab           = _mm_cvttpd_epi32(rt);
2294 #ifdef __XOP__
2295             vfeps            = _mm_frcz_pd(rt);
2296 #else
2297             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2298 #endif
2299             twovfeps         = _mm_add_pd(vfeps,vfeps);
2300             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2301
2302             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2303             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2304             F                = _mm_setzero_pd();
2305             GMX_MM_TRANSPOSE2_PD(Y,F);
2306             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2307             H                = _mm_setzero_pd();
2308             GMX_MM_TRANSPOSE2_PD(G,H);
2309             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2310             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2311             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2312
2313             fscal            = felec;
2314
2315             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2316
2317             /* Update vectorial force */
2318             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2319             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2320             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2321             
2322             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2323             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2324             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2325
2326             gmx_mm_decrement_3rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2327
2328             /* Inner loop uses 400 flops */
2329         }
2330
2331         /* End of innermost loop */
2332
2333         gmx_mm_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2334                                               f+i_coord_offset,fshift+i_shift_offset);
2335
2336         /* Increment number of inner iterations */
2337         inneriter                  += j_index_end - j_index_start;
2338
2339         /* Outer loop uses 18 flops */
2340     }
2341
2342     /* Increment number of outer iterations */
2343     outeriter        += nri;
2344
2345     /* Update outer/inner flops */
2346
2347     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*400);
2348 }