Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_128_fma_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_128_fma_double.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_double
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            CubicSplineTable
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_double
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB;
74     int              j_coord_offsetA,j_coord_offsetB;
75     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
76     real             rcutoff_scalar;
77     real             *shiftvec,*fshift,*x,*f;
78     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
79     int              vdwioffset0;
80     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
81     int              vdwioffset1;
82     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
83     int              vdwioffset2;
84     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
85     int              vdwioffset3;
86     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87     int              vdwjidx0A,vdwjidx0B;
88     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89     int              vdwjidx1A,vdwjidx1B;
90     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
91     int              vdwjidx2A,vdwjidx2B;
92     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
93     int              vdwjidx3A,vdwjidx3B;
94     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
95     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
97     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
98     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
99     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
100     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
101     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
102     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
103     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
104     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
105     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
106     real             *charge;
107     int              nvdwtype;
108     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
109     int              *vdwtype;
110     real             *vdwparam;
111     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
112     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
113     __m128i          vfitab;
114     __m128i          ifour       = _mm_set1_epi32(4);
115     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
116     real             *vftab;
117     __m128d          dummy_mask,cutoff_mask;
118     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
119     __m128d          one     = _mm_set1_pd(1.0);
120     __m128d          two     = _mm_set1_pd(2.0);
121     x                = xx[0];
122     f                = ff[0];
123
124     nri              = nlist->nri;
125     iinr             = nlist->iinr;
126     jindex           = nlist->jindex;
127     jjnr             = nlist->jjnr;
128     shiftidx         = nlist->shift;
129     gid              = nlist->gid;
130     shiftvec         = fr->shift_vec[0];
131     fshift           = fr->fshift[0];
132     facel            = _mm_set1_pd(fr->ic->epsfac);
133     charge           = mdatoms->chargeA;
134     nvdwtype         = fr->ntype;
135     vdwparam         = fr->nbfp;
136     vdwtype          = mdatoms->typeA;
137
138     vftab            = kernel_data->table_elec_vdw->data;
139     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
140
141     /* Setup water-specific parameters */
142     inr              = nlist->iinr[0];
143     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
144     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
145     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
146     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
147
148     jq1              = _mm_set1_pd(charge[inr+1]);
149     jq2              = _mm_set1_pd(charge[inr+2]);
150     jq3              = _mm_set1_pd(charge[inr+3]);
151     vdwjidx0A        = 2*vdwtype[inr+0];
152     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
153     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
154     qq11             = _mm_mul_pd(iq1,jq1);
155     qq12             = _mm_mul_pd(iq1,jq2);
156     qq13             = _mm_mul_pd(iq1,jq3);
157     qq21             = _mm_mul_pd(iq2,jq1);
158     qq22             = _mm_mul_pd(iq2,jq2);
159     qq23             = _mm_mul_pd(iq2,jq3);
160     qq31             = _mm_mul_pd(iq3,jq1);
161     qq32             = _mm_mul_pd(iq3,jq2);
162     qq33             = _mm_mul_pd(iq3,jq3);
163
164     /* Avoid stupid compiler warnings */
165     jnrA = jnrB = 0;
166     j_coord_offsetA = 0;
167     j_coord_offsetB = 0;
168
169     outeriter        = 0;
170     inneriter        = 0;
171
172     /* Start outer loop over neighborlists */
173     for(iidx=0; iidx<nri; iidx++)
174     {
175         /* Load shift vector for this list */
176         i_shift_offset   = DIM*shiftidx[iidx];
177
178         /* Load limits for loop over neighbors */
179         j_index_start    = jindex[iidx];
180         j_index_end      = jindex[iidx+1];
181
182         /* Get outer coordinate index */
183         inr              = iinr[iidx];
184         i_coord_offset   = DIM*inr;
185
186         /* Load i particle coords and add shift vector */
187         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
188                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
189
190         fix0             = _mm_setzero_pd();
191         fiy0             = _mm_setzero_pd();
192         fiz0             = _mm_setzero_pd();
193         fix1             = _mm_setzero_pd();
194         fiy1             = _mm_setzero_pd();
195         fiz1             = _mm_setzero_pd();
196         fix2             = _mm_setzero_pd();
197         fiy2             = _mm_setzero_pd();
198         fiz2             = _mm_setzero_pd();
199         fix3             = _mm_setzero_pd();
200         fiy3             = _mm_setzero_pd();
201         fiz3             = _mm_setzero_pd();
202
203         /* Reset potential sums */
204         velecsum         = _mm_setzero_pd();
205         vvdwsum          = _mm_setzero_pd();
206
207         /* Start inner kernel loop */
208         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
209         {
210
211             /* Get j neighbor index, and coordinate index */
212             jnrA             = jjnr[jidx];
213             jnrB             = jjnr[jidx+1];
214             j_coord_offsetA  = DIM*jnrA;
215             j_coord_offsetB  = DIM*jnrB;
216
217             /* load j atom coordinates */
218             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
219                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
220                                               &jy2,&jz2,&jx3,&jy3,&jz3);
221
222             /* Calculate displacement vector */
223             dx00             = _mm_sub_pd(ix0,jx0);
224             dy00             = _mm_sub_pd(iy0,jy0);
225             dz00             = _mm_sub_pd(iz0,jz0);
226             dx11             = _mm_sub_pd(ix1,jx1);
227             dy11             = _mm_sub_pd(iy1,jy1);
228             dz11             = _mm_sub_pd(iz1,jz1);
229             dx12             = _mm_sub_pd(ix1,jx2);
230             dy12             = _mm_sub_pd(iy1,jy2);
231             dz12             = _mm_sub_pd(iz1,jz2);
232             dx13             = _mm_sub_pd(ix1,jx3);
233             dy13             = _mm_sub_pd(iy1,jy3);
234             dz13             = _mm_sub_pd(iz1,jz3);
235             dx21             = _mm_sub_pd(ix2,jx1);
236             dy21             = _mm_sub_pd(iy2,jy1);
237             dz21             = _mm_sub_pd(iz2,jz1);
238             dx22             = _mm_sub_pd(ix2,jx2);
239             dy22             = _mm_sub_pd(iy2,jy2);
240             dz22             = _mm_sub_pd(iz2,jz2);
241             dx23             = _mm_sub_pd(ix2,jx3);
242             dy23             = _mm_sub_pd(iy2,jy3);
243             dz23             = _mm_sub_pd(iz2,jz3);
244             dx31             = _mm_sub_pd(ix3,jx1);
245             dy31             = _mm_sub_pd(iy3,jy1);
246             dz31             = _mm_sub_pd(iz3,jz1);
247             dx32             = _mm_sub_pd(ix3,jx2);
248             dy32             = _mm_sub_pd(iy3,jy2);
249             dz32             = _mm_sub_pd(iz3,jz2);
250             dx33             = _mm_sub_pd(ix3,jx3);
251             dy33             = _mm_sub_pd(iy3,jy3);
252             dz33             = _mm_sub_pd(iz3,jz3);
253
254             /* Calculate squared distance and things based on it */
255             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
256             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
257             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
258             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
259             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
260             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
261             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
262             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
263             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
264             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
265
266             rinv00           = avx128fma_invsqrt_d(rsq00);
267             rinv11           = avx128fma_invsqrt_d(rsq11);
268             rinv12           = avx128fma_invsqrt_d(rsq12);
269             rinv13           = avx128fma_invsqrt_d(rsq13);
270             rinv21           = avx128fma_invsqrt_d(rsq21);
271             rinv22           = avx128fma_invsqrt_d(rsq22);
272             rinv23           = avx128fma_invsqrt_d(rsq23);
273             rinv31           = avx128fma_invsqrt_d(rsq31);
274             rinv32           = avx128fma_invsqrt_d(rsq32);
275             rinv33           = avx128fma_invsqrt_d(rsq33);
276
277             fjx0             = _mm_setzero_pd();
278             fjy0             = _mm_setzero_pd();
279             fjz0             = _mm_setzero_pd();
280             fjx1             = _mm_setzero_pd();
281             fjy1             = _mm_setzero_pd();
282             fjz1             = _mm_setzero_pd();
283             fjx2             = _mm_setzero_pd();
284             fjy2             = _mm_setzero_pd();
285             fjz2             = _mm_setzero_pd();
286             fjx3             = _mm_setzero_pd();
287             fjy3             = _mm_setzero_pd();
288             fjz3             = _mm_setzero_pd();
289
290             /**************************
291              * CALCULATE INTERACTIONS *
292              **************************/
293
294             r00              = _mm_mul_pd(rsq00,rinv00);
295
296             /* Calculate table index by multiplying r with table scale and truncate to integer */
297             rt               = _mm_mul_pd(r00,vftabscale);
298             vfitab           = _mm_cvttpd_epi32(rt);
299 #ifdef __XOP__
300             vfeps            = _mm_frcz_pd(rt);
301 #else
302             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
303 #endif
304             twovfeps         = _mm_add_pd(vfeps,vfeps);
305             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
306
307             /* CUBIC SPLINE TABLE DISPERSION */
308             vfitab           = _mm_add_epi32(vfitab,ifour);
309             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
310             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
311             GMX_MM_TRANSPOSE2_PD(Y,F);
312             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
313             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
314             GMX_MM_TRANSPOSE2_PD(G,H);
315             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
316             VV               = _mm_macc_pd(vfeps,Fp,Y);
317             vvdw6            = _mm_mul_pd(c6_00,VV);
318             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
319             fvdw6            = _mm_mul_pd(c6_00,FF);
320
321             /* CUBIC SPLINE TABLE REPULSION */
322             vfitab           = _mm_add_epi32(vfitab,ifour);
323             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
324             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
325             GMX_MM_TRANSPOSE2_PD(Y,F);
326             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
327             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
328             GMX_MM_TRANSPOSE2_PD(G,H);
329             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
330             VV               = _mm_macc_pd(vfeps,Fp,Y);
331             vvdw12           = _mm_mul_pd(c12_00,VV);
332             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
333             fvdw12           = _mm_mul_pd(c12_00,FF);
334             vvdw             = _mm_add_pd(vvdw12,vvdw6);
335             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
336
337             /* Update potential sum for this i atom from the interaction with this j atom. */
338             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
339
340             fscal            = fvdw;
341
342             /* Update vectorial force */
343             fix0             = _mm_macc_pd(dx00,fscal,fix0);
344             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
345             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
346             
347             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
348             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
349             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
350
351             /**************************
352              * CALCULATE INTERACTIONS *
353              **************************/
354
355             r11              = _mm_mul_pd(rsq11,rinv11);
356
357             /* Calculate table index by multiplying r with table scale and truncate to integer */
358             rt               = _mm_mul_pd(r11,vftabscale);
359             vfitab           = _mm_cvttpd_epi32(rt);
360 #ifdef __XOP__
361             vfeps            = _mm_frcz_pd(rt);
362 #else
363             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
364 #endif
365             twovfeps         = _mm_add_pd(vfeps,vfeps);
366             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
367
368             /* CUBIC SPLINE TABLE ELECTROSTATICS */
369             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
370             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
371             GMX_MM_TRANSPOSE2_PD(Y,F);
372             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
373             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
374             GMX_MM_TRANSPOSE2_PD(G,H);
375             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
376             VV               = _mm_macc_pd(vfeps,Fp,Y);
377             velec            = _mm_mul_pd(qq11,VV);
378             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
379             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
380
381             /* Update potential sum for this i atom from the interaction with this j atom. */
382             velecsum         = _mm_add_pd(velecsum,velec);
383
384             fscal            = felec;
385
386             /* Update vectorial force */
387             fix1             = _mm_macc_pd(dx11,fscal,fix1);
388             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
389             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
390             
391             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
392             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
393             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
394
395             /**************************
396              * CALCULATE INTERACTIONS *
397              **************************/
398
399             r12              = _mm_mul_pd(rsq12,rinv12);
400
401             /* Calculate table index by multiplying r with table scale and truncate to integer */
402             rt               = _mm_mul_pd(r12,vftabscale);
403             vfitab           = _mm_cvttpd_epi32(rt);
404 #ifdef __XOP__
405             vfeps            = _mm_frcz_pd(rt);
406 #else
407             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
408 #endif
409             twovfeps         = _mm_add_pd(vfeps,vfeps);
410             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
411
412             /* CUBIC SPLINE TABLE ELECTROSTATICS */
413             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
414             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
415             GMX_MM_TRANSPOSE2_PD(Y,F);
416             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
417             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
418             GMX_MM_TRANSPOSE2_PD(G,H);
419             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
420             VV               = _mm_macc_pd(vfeps,Fp,Y);
421             velec            = _mm_mul_pd(qq12,VV);
422             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
423             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
424
425             /* Update potential sum for this i atom from the interaction with this j atom. */
426             velecsum         = _mm_add_pd(velecsum,velec);
427
428             fscal            = felec;
429
430             /* Update vectorial force */
431             fix1             = _mm_macc_pd(dx12,fscal,fix1);
432             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
433             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
434             
435             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
436             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
437             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
438
439             /**************************
440              * CALCULATE INTERACTIONS *
441              **************************/
442
443             r13              = _mm_mul_pd(rsq13,rinv13);
444
445             /* Calculate table index by multiplying r with table scale and truncate to integer */
446             rt               = _mm_mul_pd(r13,vftabscale);
447             vfitab           = _mm_cvttpd_epi32(rt);
448 #ifdef __XOP__
449             vfeps            = _mm_frcz_pd(rt);
450 #else
451             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
452 #endif
453             twovfeps         = _mm_add_pd(vfeps,vfeps);
454             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
455
456             /* CUBIC SPLINE TABLE ELECTROSTATICS */
457             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
458             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
459             GMX_MM_TRANSPOSE2_PD(Y,F);
460             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
461             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
462             GMX_MM_TRANSPOSE2_PD(G,H);
463             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
464             VV               = _mm_macc_pd(vfeps,Fp,Y);
465             velec            = _mm_mul_pd(qq13,VV);
466             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
467             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
468
469             /* Update potential sum for this i atom from the interaction with this j atom. */
470             velecsum         = _mm_add_pd(velecsum,velec);
471
472             fscal            = felec;
473
474             /* Update vectorial force */
475             fix1             = _mm_macc_pd(dx13,fscal,fix1);
476             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
477             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
478             
479             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
480             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
481             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
482
483             /**************************
484              * CALCULATE INTERACTIONS *
485              **************************/
486
487             r21              = _mm_mul_pd(rsq21,rinv21);
488
489             /* Calculate table index by multiplying r with table scale and truncate to integer */
490             rt               = _mm_mul_pd(r21,vftabscale);
491             vfitab           = _mm_cvttpd_epi32(rt);
492 #ifdef __XOP__
493             vfeps            = _mm_frcz_pd(rt);
494 #else
495             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
496 #endif
497             twovfeps         = _mm_add_pd(vfeps,vfeps);
498             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
499
500             /* CUBIC SPLINE TABLE ELECTROSTATICS */
501             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
502             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
503             GMX_MM_TRANSPOSE2_PD(Y,F);
504             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
505             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
506             GMX_MM_TRANSPOSE2_PD(G,H);
507             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
508             VV               = _mm_macc_pd(vfeps,Fp,Y);
509             velec            = _mm_mul_pd(qq21,VV);
510             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
511             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
512
513             /* Update potential sum for this i atom from the interaction with this j atom. */
514             velecsum         = _mm_add_pd(velecsum,velec);
515
516             fscal            = felec;
517
518             /* Update vectorial force */
519             fix2             = _mm_macc_pd(dx21,fscal,fix2);
520             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
521             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
522             
523             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
524             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
525             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
526
527             /**************************
528              * CALCULATE INTERACTIONS *
529              **************************/
530
531             r22              = _mm_mul_pd(rsq22,rinv22);
532
533             /* Calculate table index by multiplying r with table scale and truncate to integer */
534             rt               = _mm_mul_pd(r22,vftabscale);
535             vfitab           = _mm_cvttpd_epi32(rt);
536 #ifdef __XOP__
537             vfeps            = _mm_frcz_pd(rt);
538 #else
539             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
540 #endif
541             twovfeps         = _mm_add_pd(vfeps,vfeps);
542             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
543
544             /* CUBIC SPLINE TABLE ELECTROSTATICS */
545             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
546             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
547             GMX_MM_TRANSPOSE2_PD(Y,F);
548             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
549             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
550             GMX_MM_TRANSPOSE2_PD(G,H);
551             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
552             VV               = _mm_macc_pd(vfeps,Fp,Y);
553             velec            = _mm_mul_pd(qq22,VV);
554             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
555             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
556
557             /* Update potential sum for this i atom from the interaction with this j atom. */
558             velecsum         = _mm_add_pd(velecsum,velec);
559
560             fscal            = felec;
561
562             /* Update vectorial force */
563             fix2             = _mm_macc_pd(dx22,fscal,fix2);
564             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
565             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
566             
567             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
568             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
569             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
570
571             /**************************
572              * CALCULATE INTERACTIONS *
573              **************************/
574
575             r23              = _mm_mul_pd(rsq23,rinv23);
576
577             /* Calculate table index by multiplying r with table scale and truncate to integer */
578             rt               = _mm_mul_pd(r23,vftabscale);
579             vfitab           = _mm_cvttpd_epi32(rt);
580 #ifdef __XOP__
581             vfeps            = _mm_frcz_pd(rt);
582 #else
583             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
584 #endif
585             twovfeps         = _mm_add_pd(vfeps,vfeps);
586             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
587
588             /* CUBIC SPLINE TABLE ELECTROSTATICS */
589             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
590             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
591             GMX_MM_TRANSPOSE2_PD(Y,F);
592             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
593             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
594             GMX_MM_TRANSPOSE2_PD(G,H);
595             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
596             VV               = _mm_macc_pd(vfeps,Fp,Y);
597             velec            = _mm_mul_pd(qq23,VV);
598             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
599             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
600
601             /* Update potential sum for this i atom from the interaction with this j atom. */
602             velecsum         = _mm_add_pd(velecsum,velec);
603
604             fscal            = felec;
605
606             /* Update vectorial force */
607             fix2             = _mm_macc_pd(dx23,fscal,fix2);
608             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
609             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
610             
611             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
612             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
613             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
614
615             /**************************
616              * CALCULATE INTERACTIONS *
617              **************************/
618
619             r31              = _mm_mul_pd(rsq31,rinv31);
620
621             /* Calculate table index by multiplying r with table scale and truncate to integer */
622             rt               = _mm_mul_pd(r31,vftabscale);
623             vfitab           = _mm_cvttpd_epi32(rt);
624 #ifdef __XOP__
625             vfeps            = _mm_frcz_pd(rt);
626 #else
627             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
628 #endif
629             twovfeps         = _mm_add_pd(vfeps,vfeps);
630             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
631
632             /* CUBIC SPLINE TABLE ELECTROSTATICS */
633             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
634             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
635             GMX_MM_TRANSPOSE2_PD(Y,F);
636             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
637             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
638             GMX_MM_TRANSPOSE2_PD(G,H);
639             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
640             VV               = _mm_macc_pd(vfeps,Fp,Y);
641             velec            = _mm_mul_pd(qq31,VV);
642             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
643             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
644
645             /* Update potential sum for this i atom from the interaction with this j atom. */
646             velecsum         = _mm_add_pd(velecsum,velec);
647
648             fscal            = felec;
649
650             /* Update vectorial force */
651             fix3             = _mm_macc_pd(dx31,fscal,fix3);
652             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
653             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
654             
655             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
656             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
657             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
658
659             /**************************
660              * CALCULATE INTERACTIONS *
661              **************************/
662
663             r32              = _mm_mul_pd(rsq32,rinv32);
664
665             /* Calculate table index by multiplying r with table scale and truncate to integer */
666             rt               = _mm_mul_pd(r32,vftabscale);
667             vfitab           = _mm_cvttpd_epi32(rt);
668 #ifdef __XOP__
669             vfeps            = _mm_frcz_pd(rt);
670 #else
671             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
672 #endif
673             twovfeps         = _mm_add_pd(vfeps,vfeps);
674             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
675
676             /* CUBIC SPLINE TABLE ELECTROSTATICS */
677             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
678             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
679             GMX_MM_TRANSPOSE2_PD(Y,F);
680             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
681             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
682             GMX_MM_TRANSPOSE2_PD(G,H);
683             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
684             VV               = _mm_macc_pd(vfeps,Fp,Y);
685             velec            = _mm_mul_pd(qq32,VV);
686             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
687             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
688
689             /* Update potential sum for this i atom from the interaction with this j atom. */
690             velecsum         = _mm_add_pd(velecsum,velec);
691
692             fscal            = felec;
693
694             /* Update vectorial force */
695             fix3             = _mm_macc_pd(dx32,fscal,fix3);
696             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
697             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
698             
699             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
700             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
701             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
702
703             /**************************
704              * CALCULATE INTERACTIONS *
705              **************************/
706
707             r33              = _mm_mul_pd(rsq33,rinv33);
708
709             /* Calculate table index by multiplying r with table scale and truncate to integer */
710             rt               = _mm_mul_pd(r33,vftabscale);
711             vfitab           = _mm_cvttpd_epi32(rt);
712 #ifdef __XOP__
713             vfeps            = _mm_frcz_pd(rt);
714 #else
715             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
716 #endif
717             twovfeps         = _mm_add_pd(vfeps,vfeps);
718             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
719
720             /* CUBIC SPLINE TABLE ELECTROSTATICS */
721             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
722             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
723             GMX_MM_TRANSPOSE2_PD(Y,F);
724             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
725             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
726             GMX_MM_TRANSPOSE2_PD(G,H);
727             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
728             VV               = _mm_macc_pd(vfeps,Fp,Y);
729             velec            = _mm_mul_pd(qq33,VV);
730             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
731             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
732
733             /* Update potential sum for this i atom from the interaction with this j atom. */
734             velecsum         = _mm_add_pd(velecsum,velec);
735
736             fscal            = felec;
737
738             /* Update vectorial force */
739             fix3             = _mm_macc_pd(dx33,fscal,fix3);
740             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
741             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
742             
743             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
744             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
745             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
746
747             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
748
749             /* Inner loop uses 476 flops */
750         }
751
752         if(jidx<j_index_end)
753         {
754
755             jnrA             = jjnr[jidx];
756             j_coord_offsetA  = DIM*jnrA;
757
758             /* load j atom coordinates */
759             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
760                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
761                                               &jy2,&jz2,&jx3,&jy3,&jz3);
762
763             /* Calculate displacement vector */
764             dx00             = _mm_sub_pd(ix0,jx0);
765             dy00             = _mm_sub_pd(iy0,jy0);
766             dz00             = _mm_sub_pd(iz0,jz0);
767             dx11             = _mm_sub_pd(ix1,jx1);
768             dy11             = _mm_sub_pd(iy1,jy1);
769             dz11             = _mm_sub_pd(iz1,jz1);
770             dx12             = _mm_sub_pd(ix1,jx2);
771             dy12             = _mm_sub_pd(iy1,jy2);
772             dz12             = _mm_sub_pd(iz1,jz2);
773             dx13             = _mm_sub_pd(ix1,jx3);
774             dy13             = _mm_sub_pd(iy1,jy3);
775             dz13             = _mm_sub_pd(iz1,jz3);
776             dx21             = _mm_sub_pd(ix2,jx1);
777             dy21             = _mm_sub_pd(iy2,jy1);
778             dz21             = _mm_sub_pd(iz2,jz1);
779             dx22             = _mm_sub_pd(ix2,jx2);
780             dy22             = _mm_sub_pd(iy2,jy2);
781             dz22             = _mm_sub_pd(iz2,jz2);
782             dx23             = _mm_sub_pd(ix2,jx3);
783             dy23             = _mm_sub_pd(iy2,jy3);
784             dz23             = _mm_sub_pd(iz2,jz3);
785             dx31             = _mm_sub_pd(ix3,jx1);
786             dy31             = _mm_sub_pd(iy3,jy1);
787             dz31             = _mm_sub_pd(iz3,jz1);
788             dx32             = _mm_sub_pd(ix3,jx2);
789             dy32             = _mm_sub_pd(iy3,jy2);
790             dz32             = _mm_sub_pd(iz3,jz2);
791             dx33             = _mm_sub_pd(ix3,jx3);
792             dy33             = _mm_sub_pd(iy3,jy3);
793             dz33             = _mm_sub_pd(iz3,jz3);
794
795             /* Calculate squared distance and things based on it */
796             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
797             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
798             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
799             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
800             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
801             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
802             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
803             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
804             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
805             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
806
807             rinv00           = avx128fma_invsqrt_d(rsq00);
808             rinv11           = avx128fma_invsqrt_d(rsq11);
809             rinv12           = avx128fma_invsqrt_d(rsq12);
810             rinv13           = avx128fma_invsqrt_d(rsq13);
811             rinv21           = avx128fma_invsqrt_d(rsq21);
812             rinv22           = avx128fma_invsqrt_d(rsq22);
813             rinv23           = avx128fma_invsqrt_d(rsq23);
814             rinv31           = avx128fma_invsqrt_d(rsq31);
815             rinv32           = avx128fma_invsqrt_d(rsq32);
816             rinv33           = avx128fma_invsqrt_d(rsq33);
817
818             fjx0             = _mm_setzero_pd();
819             fjy0             = _mm_setzero_pd();
820             fjz0             = _mm_setzero_pd();
821             fjx1             = _mm_setzero_pd();
822             fjy1             = _mm_setzero_pd();
823             fjz1             = _mm_setzero_pd();
824             fjx2             = _mm_setzero_pd();
825             fjy2             = _mm_setzero_pd();
826             fjz2             = _mm_setzero_pd();
827             fjx3             = _mm_setzero_pd();
828             fjy3             = _mm_setzero_pd();
829             fjz3             = _mm_setzero_pd();
830
831             /**************************
832              * CALCULATE INTERACTIONS *
833              **************************/
834
835             r00              = _mm_mul_pd(rsq00,rinv00);
836
837             /* Calculate table index by multiplying r with table scale and truncate to integer */
838             rt               = _mm_mul_pd(r00,vftabscale);
839             vfitab           = _mm_cvttpd_epi32(rt);
840 #ifdef __XOP__
841             vfeps            = _mm_frcz_pd(rt);
842 #else
843             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
844 #endif
845             twovfeps         = _mm_add_pd(vfeps,vfeps);
846             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
847
848             /* CUBIC SPLINE TABLE DISPERSION */
849             vfitab           = _mm_add_epi32(vfitab,ifour);
850             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
851             F                = _mm_setzero_pd();
852             GMX_MM_TRANSPOSE2_PD(Y,F);
853             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
854             H                = _mm_setzero_pd();
855             GMX_MM_TRANSPOSE2_PD(G,H);
856             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
857             VV               = _mm_macc_pd(vfeps,Fp,Y);
858             vvdw6            = _mm_mul_pd(c6_00,VV);
859             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
860             fvdw6            = _mm_mul_pd(c6_00,FF);
861
862             /* CUBIC SPLINE TABLE REPULSION */
863             vfitab           = _mm_add_epi32(vfitab,ifour);
864             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
865             F                = _mm_setzero_pd();
866             GMX_MM_TRANSPOSE2_PD(Y,F);
867             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
868             H                = _mm_setzero_pd();
869             GMX_MM_TRANSPOSE2_PD(G,H);
870             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
871             VV               = _mm_macc_pd(vfeps,Fp,Y);
872             vvdw12           = _mm_mul_pd(c12_00,VV);
873             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
874             fvdw12           = _mm_mul_pd(c12_00,FF);
875             vvdw             = _mm_add_pd(vvdw12,vvdw6);
876             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
877
878             /* Update potential sum for this i atom from the interaction with this j atom. */
879             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
880             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
881
882             fscal            = fvdw;
883
884             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
885
886             /* Update vectorial force */
887             fix0             = _mm_macc_pd(dx00,fscal,fix0);
888             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
889             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
890             
891             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
892             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
893             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
894
895             /**************************
896              * CALCULATE INTERACTIONS *
897              **************************/
898
899             r11              = _mm_mul_pd(rsq11,rinv11);
900
901             /* Calculate table index by multiplying r with table scale and truncate to integer */
902             rt               = _mm_mul_pd(r11,vftabscale);
903             vfitab           = _mm_cvttpd_epi32(rt);
904 #ifdef __XOP__
905             vfeps            = _mm_frcz_pd(rt);
906 #else
907             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
908 #endif
909             twovfeps         = _mm_add_pd(vfeps,vfeps);
910             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
911
912             /* CUBIC SPLINE TABLE ELECTROSTATICS */
913             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
914             F                = _mm_setzero_pd();
915             GMX_MM_TRANSPOSE2_PD(Y,F);
916             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
917             H                = _mm_setzero_pd();
918             GMX_MM_TRANSPOSE2_PD(G,H);
919             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
920             VV               = _mm_macc_pd(vfeps,Fp,Y);
921             velec            = _mm_mul_pd(qq11,VV);
922             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
923             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
924
925             /* Update potential sum for this i atom from the interaction with this j atom. */
926             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
927             velecsum         = _mm_add_pd(velecsum,velec);
928
929             fscal            = felec;
930
931             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
932
933             /* Update vectorial force */
934             fix1             = _mm_macc_pd(dx11,fscal,fix1);
935             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
936             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
937             
938             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
939             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
940             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
941
942             /**************************
943              * CALCULATE INTERACTIONS *
944              **************************/
945
946             r12              = _mm_mul_pd(rsq12,rinv12);
947
948             /* Calculate table index by multiplying r with table scale and truncate to integer */
949             rt               = _mm_mul_pd(r12,vftabscale);
950             vfitab           = _mm_cvttpd_epi32(rt);
951 #ifdef __XOP__
952             vfeps            = _mm_frcz_pd(rt);
953 #else
954             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
955 #endif
956             twovfeps         = _mm_add_pd(vfeps,vfeps);
957             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
958
959             /* CUBIC SPLINE TABLE ELECTROSTATICS */
960             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
961             F                = _mm_setzero_pd();
962             GMX_MM_TRANSPOSE2_PD(Y,F);
963             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
964             H                = _mm_setzero_pd();
965             GMX_MM_TRANSPOSE2_PD(G,H);
966             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
967             VV               = _mm_macc_pd(vfeps,Fp,Y);
968             velec            = _mm_mul_pd(qq12,VV);
969             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
970             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
971
972             /* Update potential sum for this i atom from the interaction with this j atom. */
973             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
974             velecsum         = _mm_add_pd(velecsum,velec);
975
976             fscal            = felec;
977
978             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
979
980             /* Update vectorial force */
981             fix1             = _mm_macc_pd(dx12,fscal,fix1);
982             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
983             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
984             
985             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
986             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
987             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
988
989             /**************************
990              * CALCULATE INTERACTIONS *
991              **************************/
992
993             r13              = _mm_mul_pd(rsq13,rinv13);
994
995             /* Calculate table index by multiplying r with table scale and truncate to integer */
996             rt               = _mm_mul_pd(r13,vftabscale);
997             vfitab           = _mm_cvttpd_epi32(rt);
998 #ifdef __XOP__
999             vfeps            = _mm_frcz_pd(rt);
1000 #else
1001             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1002 #endif
1003             twovfeps         = _mm_add_pd(vfeps,vfeps);
1004             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1005
1006             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1007             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1008             F                = _mm_setzero_pd();
1009             GMX_MM_TRANSPOSE2_PD(Y,F);
1010             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1011             H                = _mm_setzero_pd();
1012             GMX_MM_TRANSPOSE2_PD(G,H);
1013             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1014             VV               = _mm_macc_pd(vfeps,Fp,Y);
1015             velec            = _mm_mul_pd(qq13,VV);
1016             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1017             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1018
1019             /* Update potential sum for this i atom from the interaction with this j atom. */
1020             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1021             velecsum         = _mm_add_pd(velecsum,velec);
1022
1023             fscal            = felec;
1024
1025             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1026
1027             /* Update vectorial force */
1028             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1029             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1030             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1031             
1032             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1033             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1034             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1035
1036             /**************************
1037              * CALCULATE INTERACTIONS *
1038              **************************/
1039
1040             r21              = _mm_mul_pd(rsq21,rinv21);
1041
1042             /* Calculate table index by multiplying r with table scale and truncate to integer */
1043             rt               = _mm_mul_pd(r21,vftabscale);
1044             vfitab           = _mm_cvttpd_epi32(rt);
1045 #ifdef __XOP__
1046             vfeps            = _mm_frcz_pd(rt);
1047 #else
1048             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1049 #endif
1050             twovfeps         = _mm_add_pd(vfeps,vfeps);
1051             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1052
1053             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1054             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1055             F                = _mm_setzero_pd();
1056             GMX_MM_TRANSPOSE2_PD(Y,F);
1057             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1058             H                = _mm_setzero_pd();
1059             GMX_MM_TRANSPOSE2_PD(G,H);
1060             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1061             VV               = _mm_macc_pd(vfeps,Fp,Y);
1062             velec            = _mm_mul_pd(qq21,VV);
1063             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1064             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1065
1066             /* Update potential sum for this i atom from the interaction with this j atom. */
1067             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1068             velecsum         = _mm_add_pd(velecsum,velec);
1069
1070             fscal            = felec;
1071
1072             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1073
1074             /* Update vectorial force */
1075             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1076             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1077             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1078             
1079             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1080             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1081             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1082
1083             /**************************
1084              * CALCULATE INTERACTIONS *
1085              **************************/
1086
1087             r22              = _mm_mul_pd(rsq22,rinv22);
1088
1089             /* Calculate table index by multiplying r with table scale and truncate to integer */
1090             rt               = _mm_mul_pd(r22,vftabscale);
1091             vfitab           = _mm_cvttpd_epi32(rt);
1092 #ifdef __XOP__
1093             vfeps            = _mm_frcz_pd(rt);
1094 #else
1095             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1096 #endif
1097             twovfeps         = _mm_add_pd(vfeps,vfeps);
1098             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1099
1100             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1101             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1102             F                = _mm_setzero_pd();
1103             GMX_MM_TRANSPOSE2_PD(Y,F);
1104             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1105             H                = _mm_setzero_pd();
1106             GMX_MM_TRANSPOSE2_PD(G,H);
1107             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1108             VV               = _mm_macc_pd(vfeps,Fp,Y);
1109             velec            = _mm_mul_pd(qq22,VV);
1110             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1111             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1112
1113             /* Update potential sum for this i atom from the interaction with this j atom. */
1114             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1115             velecsum         = _mm_add_pd(velecsum,velec);
1116
1117             fscal            = felec;
1118
1119             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1120
1121             /* Update vectorial force */
1122             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1123             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1124             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1125             
1126             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1127             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1128             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1129
1130             /**************************
1131              * CALCULATE INTERACTIONS *
1132              **************************/
1133
1134             r23              = _mm_mul_pd(rsq23,rinv23);
1135
1136             /* Calculate table index by multiplying r with table scale and truncate to integer */
1137             rt               = _mm_mul_pd(r23,vftabscale);
1138             vfitab           = _mm_cvttpd_epi32(rt);
1139 #ifdef __XOP__
1140             vfeps            = _mm_frcz_pd(rt);
1141 #else
1142             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1143 #endif
1144             twovfeps         = _mm_add_pd(vfeps,vfeps);
1145             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1146
1147             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1149             F                = _mm_setzero_pd();
1150             GMX_MM_TRANSPOSE2_PD(Y,F);
1151             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1152             H                = _mm_setzero_pd();
1153             GMX_MM_TRANSPOSE2_PD(G,H);
1154             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1155             VV               = _mm_macc_pd(vfeps,Fp,Y);
1156             velec            = _mm_mul_pd(qq23,VV);
1157             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1158             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1159
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1162             velecsum         = _mm_add_pd(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1167
1168             /* Update vectorial force */
1169             fix2             = _mm_macc_pd(dx23,fscal,fix2);
1170             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
1171             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
1172             
1173             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
1174             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
1175             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
1176
1177             /**************************
1178              * CALCULATE INTERACTIONS *
1179              **************************/
1180
1181             r31              = _mm_mul_pd(rsq31,rinv31);
1182
1183             /* Calculate table index by multiplying r with table scale and truncate to integer */
1184             rt               = _mm_mul_pd(r31,vftabscale);
1185             vfitab           = _mm_cvttpd_epi32(rt);
1186 #ifdef __XOP__
1187             vfeps            = _mm_frcz_pd(rt);
1188 #else
1189             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1190 #endif
1191             twovfeps         = _mm_add_pd(vfeps,vfeps);
1192             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1193
1194             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1195             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1196             F                = _mm_setzero_pd();
1197             GMX_MM_TRANSPOSE2_PD(Y,F);
1198             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1199             H                = _mm_setzero_pd();
1200             GMX_MM_TRANSPOSE2_PD(G,H);
1201             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1202             VV               = _mm_macc_pd(vfeps,Fp,Y);
1203             velec            = _mm_mul_pd(qq31,VV);
1204             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1205             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1206
1207             /* Update potential sum for this i atom from the interaction with this j atom. */
1208             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1209             velecsum         = _mm_add_pd(velecsum,velec);
1210
1211             fscal            = felec;
1212
1213             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1214
1215             /* Update vectorial force */
1216             fix3             = _mm_macc_pd(dx31,fscal,fix3);
1217             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
1218             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
1219             
1220             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
1221             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
1222             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
1223
1224             /**************************
1225              * CALCULATE INTERACTIONS *
1226              **************************/
1227
1228             r32              = _mm_mul_pd(rsq32,rinv32);
1229
1230             /* Calculate table index by multiplying r with table scale and truncate to integer */
1231             rt               = _mm_mul_pd(r32,vftabscale);
1232             vfitab           = _mm_cvttpd_epi32(rt);
1233 #ifdef __XOP__
1234             vfeps            = _mm_frcz_pd(rt);
1235 #else
1236             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1237 #endif
1238             twovfeps         = _mm_add_pd(vfeps,vfeps);
1239             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1240
1241             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1242             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1243             F                = _mm_setzero_pd();
1244             GMX_MM_TRANSPOSE2_PD(Y,F);
1245             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1246             H                = _mm_setzero_pd();
1247             GMX_MM_TRANSPOSE2_PD(G,H);
1248             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1249             VV               = _mm_macc_pd(vfeps,Fp,Y);
1250             velec            = _mm_mul_pd(qq32,VV);
1251             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1252             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1253
1254             /* Update potential sum for this i atom from the interaction with this j atom. */
1255             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1256             velecsum         = _mm_add_pd(velecsum,velec);
1257
1258             fscal            = felec;
1259
1260             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1261
1262             /* Update vectorial force */
1263             fix3             = _mm_macc_pd(dx32,fscal,fix3);
1264             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
1265             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
1266             
1267             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
1268             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
1269             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
1270
1271             /**************************
1272              * CALCULATE INTERACTIONS *
1273              **************************/
1274
1275             r33              = _mm_mul_pd(rsq33,rinv33);
1276
1277             /* Calculate table index by multiplying r with table scale and truncate to integer */
1278             rt               = _mm_mul_pd(r33,vftabscale);
1279             vfitab           = _mm_cvttpd_epi32(rt);
1280 #ifdef __XOP__
1281             vfeps            = _mm_frcz_pd(rt);
1282 #else
1283             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1284 #endif
1285             twovfeps         = _mm_add_pd(vfeps,vfeps);
1286             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1287
1288             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1289             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1290             F                = _mm_setzero_pd();
1291             GMX_MM_TRANSPOSE2_PD(Y,F);
1292             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1293             H                = _mm_setzero_pd();
1294             GMX_MM_TRANSPOSE2_PD(G,H);
1295             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1296             VV               = _mm_macc_pd(vfeps,Fp,Y);
1297             velec            = _mm_mul_pd(qq33,VV);
1298             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1299             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1300
1301             /* Update potential sum for this i atom from the interaction with this j atom. */
1302             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1303             velecsum         = _mm_add_pd(velecsum,velec);
1304
1305             fscal            = felec;
1306
1307             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1308
1309             /* Update vectorial force */
1310             fix3             = _mm_macc_pd(dx33,fscal,fix3);
1311             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
1312             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
1313             
1314             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
1315             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
1316             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
1317
1318             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1319
1320             /* Inner loop uses 476 flops */
1321         }
1322
1323         /* End of innermost loop */
1324
1325         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1326                                               f+i_coord_offset,fshift+i_shift_offset);
1327
1328         ggid                        = gid[iidx];
1329         /* Update potential energies */
1330         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1331         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1332
1333         /* Increment number of inner iterations */
1334         inneriter                  += j_index_end - j_index_start;
1335
1336         /* Outer loop uses 26 flops */
1337     }
1338
1339     /* Increment number of outer iterations */
1340     outeriter        += nri;
1341
1342     /* Update outer/inner flops */
1343
1344     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*476);
1345 }
1346 /*
1347  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_double
1348  * Electrostatics interaction: CubicSplineTable
1349  * VdW interaction:            CubicSplineTable
1350  * Geometry:                   Water4-Water4
1351  * Calculate force/pot:        Force
1352  */
1353 void
1354 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_double
1355                     (t_nblist                    * gmx_restrict       nlist,
1356                      rvec                        * gmx_restrict          xx,
1357                      rvec                        * gmx_restrict          ff,
1358                      struct t_forcerec           * gmx_restrict          fr,
1359                      t_mdatoms                   * gmx_restrict     mdatoms,
1360                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1361                      t_nrnb                      * gmx_restrict        nrnb)
1362 {
1363     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1364      * just 0 for non-waters.
1365      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1366      * jnr indices corresponding to data put in the four positions in the SIMD register.
1367      */
1368     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1369     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1370     int              jnrA,jnrB;
1371     int              j_coord_offsetA,j_coord_offsetB;
1372     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1373     real             rcutoff_scalar;
1374     real             *shiftvec,*fshift,*x,*f;
1375     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1376     int              vdwioffset0;
1377     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1378     int              vdwioffset1;
1379     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1380     int              vdwioffset2;
1381     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1382     int              vdwioffset3;
1383     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1384     int              vdwjidx0A,vdwjidx0B;
1385     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1386     int              vdwjidx1A,vdwjidx1B;
1387     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1388     int              vdwjidx2A,vdwjidx2B;
1389     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1390     int              vdwjidx3A,vdwjidx3B;
1391     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1392     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1393     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1394     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1395     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1396     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1397     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1398     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1399     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1400     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1401     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1402     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1403     real             *charge;
1404     int              nvdwtype;
1405     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1406     int              *vdwtype;
1407     real             *vdwparam;
1408     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1409     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1410     __m128i          vfitab;
1411     __m128i          ifour       = _mm_set1_epi32(4);
1412     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1413     real             *vftab;
1414     __m128d          dummy_mask,cutoff_mask;
1415     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1416     __m128d          one     = _mm_set1_pd(1.0);
1417     __m128d          two     = _mm_set1_pd(2.0);
1418     x                = xx[0];
1419     f                = ff[0];
1420
1421     nri              = nlist->nri;
1422     iinr             = nlist->iinr;
1423     jindex           = nlist->jindex;
1424     jjnr             = nlist->jjnr;
1425     shiftidx         = nlist->shift;
1426     gid              = nlist->gid;
1427     shiftvec         = fr->shift_vec[0];
1428     fshift           = fr->fshift[0];
1429     facel            = _mm_set1_pd(fr->ic->epsfac);
1430     charge           = mdatoms->chargeA;
1431     nvdwtype         = fr->ntype;
1432     vdwparam         = fr->nbfp;
1433     vdwtype          = mdatoms->typeA;
1434
1435     vftab            = kernel_data->table_elec_vdw->data;
1436     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1437
1438     /* Setup water-specific parameters */
1439     inr              = nlist->iinr[0];
1440     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1441     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1442     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1443     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1444
1445     jq1              = _mm_set1_pd(charge[inr+1]);
1446     jq2              = _mm_set1_pd(charge[inr+2]);
1447     jq3              = _mm_set1_pd(charge[inr+3]);
1448     vdwjidx0A        = 2*vdwtype[inr+0];
1449     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1450     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1451     qq11             = _mm_mul_pd(iq1,jq1);
1452     qq12             = _mm_mul_pd(iq1,jq2);
1453     qq13             = _mm_mul_pd(iq1,jq3);
1454     qq21             = _mm_mul_pd(iq2,jq1);
1455     qq22             = _mm_mul_pd(iq2,jq2);
1456     qq23             = _mm_mul_pd(iq2,jq3);
1457     qq31             = _mm_mul_pd(iq3,jq1);
1458     qq32             = _mm_mul_pd(iq3,jq2);
1459     qq33             = _mm_mul_pd(iq3,jq3);
1460
1461     /* Avoid stupid compiler warnings */
1462     jnrA = jnrB = 0;
1463     j_coord_offsetA = 0;
1464     j_coord_offsetB = 0;
1465
1466     outeriter        = 0;
1467     inneriter        = 0;
1468
1469     /* Start outer loop over neighborlists */
1470     for(iidx=0; iidx<nri; iidx++)
1471     {
1472         /* Load shift vector for this list */
1473         i_shift_offset   = DIM*shiftidx[iidx];
1474
1475         /* Load limits for loop over neighbors */
1476         j_index_start    = jindex[iidx];
1477         j_index_end      = jindex[iidx+1];
1478
1479         /* Get outer coordinate index */
1480         inr              = iinr[iidx];
1481         i_coord_offset   = DIM*inr;
1482
1483         /* Load i particle coords and add shift vector */
1484         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1485                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1486
1487         fix0             = _mm_setzero_pd();
1488         fiy0             = _mm_setzero_pd();
1489         fiz0             = _mm_setzero_pd();
1490         fix1             = _mm_setzero_pd();
1491         fiy1             = _mm_setzero_pd();
1492         fiz1             = _mm_setzero_pd();
1493         fix2             = _mm_setzero_pd();
1494         fiy2             = _mm_setzero_pd();
1495         fiz2             = _mm_setzero_pd();
1496         fix3             = _mm_setzero_pd();
1497         fiy3             = _mm_setzero_pd();
1498         fiz3             = _mm_setzero_pd();
1499
1500         /* Start inner kernel loop */
1501         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1502         {
1503
1504             /* Get j neighbor index, and coordinate index */
1505             jnrA             = jjnr[jidx];
1506             jnrB             = jjnr[jidx+1];
1507             j_coord_offsetA  = DIM*jnrA;
1508             j_coord_offsetB  = DIM*jnrB;
1509
1510             /* load j atom coordinates */
1511             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1512                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1513                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1514
1515             /* Calculate displacement vector */
1516             dx00             = _mm_sub_pd(ix0,jx0);
1517             dy00             = _mm_sub_pd(iy0,jy0);
1518             dz00             = _mm_sub_pd(iz0,jz0);
1519             dx11             = _mm_sub_pd(ix1,jx1);
1520             dy11             = _mm_sub_pd(iy1,jy1);
1521             dz11             = _mm_sub_pd(iz1,jz1);
1522             dx12             = _mm_sub_pd(ix1,jx2);
1523             dy12             = _mm_sub_pd(iy1,jy2);
1524             dz12             = _mm_sub_pd(iz1,jz2);
1525             dx13             = _mm_sub_pd(ix1,jx3);
1526             dy13             = _mm_sub_pd(iy1,jy3);
1527             dz13             = _mm_sub_pd(iz1,jz3);
1528             dx21             = _mm_sub_pd(ix2,jx1);
1529             dy21             = _mm_sub_pd(iy2,jy1);
1530             dz21             = _mm_sub_pd(iz2,jz1);
1531             dx22             = _mm_sub_pd(ix2,jx2);
1532             dy22             = _mm_sub_pd(iy2,jy2);
1533             dz22             = _mm_sub_pd(iz2,jz2);
1534             dx23             = _mm_sub_pd(ix2,jx3);
1535             dy23             = _mm_sub_pd(iy2,jy3);
1536             dz23             = _mm_sub_pd(iz2,jz3);
1537             dx31             = _mm_sub_pd(ix3,jx1);
1538             dy31             = _mm_sub_pd(iy3,jy1);
1539             dz31             = _mm_sub_pd(iz3,jz1);
1540             dx32             = _mm_sub_pd(ix3,jx2);
1541             dy32             = _mm_sub_pd(iy3,jy2);
1542             dz32             = _mm_sub_pd(iz3,jz2);
1543             dx33             = _mm_sub_pd(ix3,jx3);
1544             dy33             = _mm_sub_pd(iy3,jy3);
1545             dz33             = _mm_sub_pd(iz3,jz3);
1546
1547             /* Calculate squared distance and things based on it */
1548             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1549             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1550             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1551             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1552             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1553             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1554             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1555             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1556             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1557             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1558
1559             rinv00           = avx128fma_invsqrt_d(rsq00);
1560             rinv11           = avx128fma_invsqrt_d(rsq11);
1561             rinv12           = avx128fma_invsqrt_d(rsq12);
1562             rinv13           = avx128fma_invsqrt_d(rsq13);
1563             rinv21           = avx128fma_invsqrt_d(rsq21);
1564             rinv22           = avx128fma_invsqrt_d(rsq22);
1565             rinv23           = avx128fma_invsqrt_d(rsq23);
1566             rinv31           = avx128fma_invsqrt_d(rsq31);
1567             rinv32           = avx128fma_invsqrt_d(rsq32);
1568             rinv33           = avx128fma_invsqrt_d(rsq33);
1569
1570             fjx0             = _mm_setzero_pd();
1571             fjy0             = _mm_setzero_pd();
1572             fjz0             = _mm_setzero_pd();
1573             fjx1             = _mm_setzero_pd();
1574             fjy1             = _mm_setzero_pd();
1575             fjz1             = _mm_setzero_pd();
1576             fjx2             = _mm_setzero_pd();
1577             fjy2             = _mm_setzero_pd();
1578             fjz2             = _mm_setzero_pd();
1579             fjx3             = _mm_setzero_pd();
1580             fjy3             = _mm_setzero_pd();
1581             fjz3             = _mm_setzero_pd();
1582
1583             /**************************
1584              * CALCULATE INTERACTIONS *
1585              **************************/
1586
1587             r00              = _mm_mul_pd(rsq00,rinv00);
1588
1589             /* Calculate table index by multiplying r with table scale and truncate to integer */
1590             rt               = _mm_mul_pd(r00,vftabscale);
1591             vfitab           = _mm_cvttpd_epi32(rt);
1592 #ifdef __XOP__
1593             vfeps            = _mm_frcz_pd(rt);
1594 #else
1595             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1596 #endif
1597             twovfeps         = _mm_add_pd(vfeps,vfeps);
1598             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1599
1600             /* CUBIC SPLINE TABLE DISPERSION */
1601             vfitab           = _mm_add_epi32(vfitab,ifour);
1602             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1603             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1604             GMX_MM_TRANSPOSE2_PD(Y,F);
1605             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1606             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1607             GMX_MM_TRANSPOSE2_PD(G,H);
1608             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1609             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1610             fvdw6            = _mm_mul_pd(c6_00,FF);
1611
1612             /* CUBIC SPLINE TABLE REPULSION */
1613             vfitab           = _mm_add_epi32(vfitab,ifour);
1614             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1615             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1616             GMX_MM_TRANSPOSE2_PD(Y,F);
1617             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1618             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1619             GMX_MM_TRANSPOSE2_PD(G,H);
1620             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1621             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1622             fvdw12           = _mm_mul_pd(c12_00,FF);
1623             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1624
1625             fscal            = fvdw;
1626
1627             /* Update vectorial force */
1628             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1629             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1630             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1631             
1632             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1633             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1634             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1635
1636             /**************************
1637              * CALCULATE INTERACTIONS *
1638              **************************/
1639
1640             r11              = _mm_mul_pd(rsq11,rinv11);
1641
1642             /* Calculate table index by multiplying r with table scale and truncate to integer */
1643             rt               = _mm_mul_pd(r11,vftabscale);
1644             vfitab           = _mm_cvttpd_epi32(rt);
1645 #ifdef __XOP__
1646             vfeps            = _mm_frcz_pd(rt);
1647 #else
1648             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1649 #endif
1650             twovfeps         = _mm_add_pd(vfeps,vfeps);
1651             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1652
1653             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1654             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1655             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1656             GMX_MM_TRANSPOSE2_PD(Y,F);
1657             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1658             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1659             GMX_MM_TRANSPOSE2_PD(G,H);
1660             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1661             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1662             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1663
1664             fscal            = felec;
1665
1666             /* Update vectorial force */
1667             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1668             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1669             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1670             
1671             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1672             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1673             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1674
1675             /**************************
1676              * CALCULATE INTERACTIONS *
1677              **************************/
1678
1679             r12              = _mm_mul_pd(rsq12,rinv12);
1680
1681             /* Calculate table index by multiplying r with table scale and truncate to integer */
1682             rt               = _mm_mul_pd(r12,vftabscale);
1683             vfitab           = _mm_cvttpd_epi32(rt);
1684 #ifdef __XOP__
1685             vfeps            = _mm_frcz_pd(rt);
1686 #else
1687             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1688 #endif
1689             twovfeps         = _mm_add_pd(vfeps,vfeps);
1690             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1691
1692             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1693             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1694             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1695             GMX_MM_TRANSPOSE2_PD(Y,F);
1696             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1697             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1698             GMX_MM_TRANSPOSE2_PD(G,H);
1699             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1700             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1701             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1702
1703             fscal            = felec;
1704
1705             /* Update vectorial force */
1706             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1707             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1708             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1709             
1710             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1711             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1712             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1713
1714             /**************************
1715              * CALCULATE INTERACTIONS *
1716              **************************/
1717
1718             r13              = _mm_mul_pd(rsq13,rinv13);
1719
1720             /* Calculate table index by multiplying r with table scale and truncate to integer */
1721             rt               = _mm_mul_pd(r13,vftabscale);
1722             vfitab           = _mm_cvttpd_epi32(rt);
1723 #ifdef __XOP__
1724             vfeps            = _mm_frcz_pd(rt);
1725 #else
1726             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1727 #endif
1728             twovfeps         = _mm_add_pd(vfeps,vfeps);
1729             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1730
1731             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1732             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1733             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1734             GMX_MM_TRANSPOSE2_PD(Y,F);
1735             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1736             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1737             GMX_MM_TRANSPOSE2_PD(G,H);
1738             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1739             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1740             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1741
1742             fscal            = felec;
1743
1744             /* Update vectorial force */
1745             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1746             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1747             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1748             
1749             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1750             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1751             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1752
1753             /**************************
1754              * CALCULATE INTERACTIONS *
1755              **************************/
1756
1757             r21              = _mm_mul_pd(rsq21,rinv21);
1758
1759             /* Calculate table index by multiplying r with table scale and truncate to integer */
1760             rt               = _mm_mul_pd(r21,vftabscale);
1761             vfitab           = _mm_cvttpd_epi32(rt);
1762 #ifdef __XOP__
1763             vfeps            = _mm_frcz_pd(rt);
1764 #else
1765             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1766 #endif
1767             twovfeps         = _mm_add_pd(vfeps,vfeps);
1768             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1769
1770             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1771             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1772             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1773             GMX_MM_TRANSPOSE2_PD(Y,F);
1774             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1775             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1776             GMX_MM_TRANSPOSE2_PD(G,H);
1777             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1778             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1779             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1780
1781             fscal            = felec;
1782
1783             /* Update vectorial force */
1784             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1785             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1786             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1787             
1788             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1789             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1790             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1791
1792             /**************************
1793              * CALCULATE INTERACTIONS *
1794              **************************/
1795
1796             r22              = _mm_mul_pd(rsq22,rinv22);
1797
1798             /* Calculate table index by multiplying r with table scale and truncate to integer */
1799             rt               = _mm_mul_pd(r22,vftabscale);
1800             vfitab           = _mm_cvttpd_epi32(rt);
1801 #ifdef __XOP__
1802             vfeps            = _mm_frcz_pd(rt);
1803 #else
1804             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1805 #endif
1806             twovfeps         = _mm_add_pd(vfeps,vfeps);
1807             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1808
1809             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1810             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1811             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1812             GMX_MM_TRANSPOSE2_PD(Y,F);
1813             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1814             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1815             GMX_MM_TRANSPOSE2_PD(G,H);
1816             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1817             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1818             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1819
1820             fscal            = felec;
1821
1822             /* Update vectorial force */
1823             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1824             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1825             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1826             
1827             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1828             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1829             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1830
1831             /**************************
1832              * CALCULATE INTERACTIONS *
1833              **************************/
1834
1835             r23              = _mm_mul_pd(rsq23,rinv23);
1836
1837             /* Calculate table index by multiplying r with table scale and truncate to integer */
1838             rt               = _mm_mul_pd(r23,vftabscale);
1839             vfitab           = _mm_cvttpd_epi32(rt);
1840 #ifdef __XOP__
1841             vfeps            = _mm_frcz_pd(rt);
1842 #else
1843             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1844 #endif
1845             twovfeps         = _mm_add_pd(vfeps,vfeps);
1846             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1847
1848             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1849             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1850             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1851             GMX_MM_TRANSPOSE2_PD(Y,F);
1852             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1853             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1854             GMX_MM_TRANSPOSE2_PD(G,H);
1855             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1856             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1857             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1858
1859             fscal            = felec;
1860
1861             /* Update vectorial force */
1862             fix2             = _mm_macc_pd(dx23,fscal,fix2);
1863             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
1864             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
1865             
1866             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
1867             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
1868             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
1869
1870             /**************************
1871              * CALCULATE INTERACTIONS *
1872              **************************/
1873
1874             r31              = _mm_mul_pd(rsq31,rinv31);
1875
1876             /* Calculate table index by multiplying r with table scale and truncate to integer */
1877             rt               = _mm_mul_pd(r31,vftabscale);
1878             vfitab           = _mm_cvttpd_epi32(rt);
1879 #ifdef __XOP__
1880             vfeps            = _mm_frcz_pd(rt);
1881 #else
1882             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1883 #endif
1884             twovfeps         = _mm_add_pd(vfeps,vfeps);
1885             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1886
1887             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1888             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1889             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1890             GMX_MM_TRANSPOSE2_PD(Y,F);
1891             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1892             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1893             GMX_MM_TRANSPOSE2_PD(G,H);
1894             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1895             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1896             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1897
1898             fscal            = felec;
1899
1900             /* Update vectorial force */
1901             fix3             = _mm_macc_pd(dx31,fscal,fix3);
1902             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
1903             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
1904             
1905             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
1906             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
1907             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
1908
1909             /**************************
1910              * CALCULATE INTERACTIONS *
1911              **************************/
1912
1913             r32              = _mm_mul_pd(rsq32,rinv32);
1914
1915             /* Calculate table index by multiplying r with table scale and truncate to integer */
1916             rt               = _mm_mul_pd(r32,vftabscale);
1917             vfitab           = _mm_cvttpd_epi32(rt);
1918 #ifdef __XOP__
1919             vfeps            = _mm_frcz_pd(rt);
1920 #else
1921             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1922 #endif
1923             twovfeps         = _mm_add_pd(vfeps,vfeps);
1924             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1925
1926             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1927             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1928             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1929             GMX_MM_TRANSPOSE2_PD(Y,F);
1930             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1931             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1932             GMX_MM_TRANSPOSE2_PD(G,H);
1933             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1934             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1935             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1936
1937             fscal            = felec;
1938
1939             /* Update vectorial force */
1940             fix3             = _mm_macc_pd(dx32,fscal,fix3);
1941             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
1942             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
1943             
1944             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
1945             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
1946             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
1947
1948             /**************************
1949              * CALCULATE INTERACTIONS *
1950              **************************/
1951
1952             r33              = _mm_mul_pd(rsq33,rinv33);
1953
1954             /* Calculate table index by multiplying r with table scale and truncate to integer */
1955             rt               = _mm_mul_pd(r33,vftabscale);
1956             vfitab           = _mm_cvttpd_epi32(rt);
1957 #ifdef __XOP__
1958             vfeps            = _mm_frcz_pd(rt);
1959 #else
1960             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1961 #endif
1962             twovfeps         = _mm_add_pd(vfeps,vfeps);
1963             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1964
1965             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1966             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1967             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1968             GMX_MM_TRANSPOSE2_PD(Y,F);
1969             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1970             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1971             GMX_MM_TRANSPOSE2_PD(G,H);
1972             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1973             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1974             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1975
1976             fscal            = felec;
1977
1978             /* Update vectorial force */
1979             fix3             = _mm_macc_pd(dx33,fscal,fix3);
1980             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
1981             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
1982             
1983             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
1984             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
1985             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
1986
1987             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1988
1989             /* Inner loop uses 432 flops */
1990         }
1991
1992         if(jidx<j_index_end)
1993         {
1994
1995             jnrA             = jjnr[jidx];
1996             j_coord_offsetA  = DIM*jnrA;
1997
1998             /* load j atom coordinates */
1999             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2000                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2001                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2002
2003             /* Calculate displacement vector */
2004             dx00             = _mm_sub_pd(ix0,jx0);
2005             dy00             = _mm_sub_pd(iy0,jy0);
2006             dz00             = _mm_sub_pd(iz0,jz0);
2007             dx11             = _mm_sub_pd(ix1,jx1);
2008             dy11             = _mm_sub_pd(iy1,jy1);
2009             dz11             = _mm_sub_pd(iz1,jz1);
2010             dx12             = _mm_sub_pd(ix1,jx2);
2011             dy12             = _mm_sub_pd(iy1,jy2);
2012             dz12             = _mm_sub_pd(iz1,jz2);
2013             dx13             = _mm_sub_pd(ix1,jx3);
2014             dy13             = _mm_sub_pd(iy1,jy3);
2015             dz13             = _mm_sub_pd(iz1,jz3);
2016             dx21             = _mm_sub_pd(ix2,jx1);
2017             dy21             = _mm_sub_pd(iy2,jy1);
2018             dz21             = _mm_sub_pd(iz2,jz1);
2019             dx22             = _mm_sub_pd(ix2,jx2);
2020             dy22             = _mm_sub_pd(iy2,jy2);
2021             dz22             = _mm_sub_pd(iz2,jz2);
2022             dx23             = _mm_sub_pd(ix2,jx3);
2023             dy23             = _mm_sub_pd(iy2,jy3);
2024             dz23             = _mm_sub_pd(iz2,jz3);
2025             dx31             = _mm_sub_pd(ix3,jx1);
2026             dy31             = _mm_sub_pd(iy3,jy1);
2027             dz31             = _mm_sub_pd(iz3,jz1);
2028             dx32             = _mm_sub_pd(ix3,jx2);
2029             dy32             = _mm_sub_pd(iy3,jy2);
2030             dz32             = _mm_sub_pd(iz3,jz2);
2031             dx33             = _mm_sub_pd(ix3,jx3);
2032             dy33             = _mm_sub_pd(iy3,jy3);
2033             dz33             = _mm_sub_pd(iz3,jz3);
2034
2035             /* Calculate squared distance and things based on it */
2036             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2037             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2038             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2039             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2040             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2041             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2042             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2043             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2044             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2045             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2046
2047             rinv00           = avx128fma_invsqrt_d(rsq00);
2048             rinv11           = avx128fma_invsqrt_d(rsq11);
2049             rinv12           = avx128fma_invsqrt_d(rsq12);
2050             rinv13           = avx128fma_invsqrt_d(rsq13);
2051             rinv21           = avx128fma_invsqrt_d(rsq21);
2052             rinv22           = avx128fma_invsqrt_d(rsq22);
2053             rinv23           = avx128fma_invsqrt_d(rsq23);
2054             rinv31           = avx128fma_invsqrt_d(rsq31);
2055             rinv32           = avx128fma_invsqrt_d(rsq32);
2056             rinv33           = avx128fma_invsqrt_d(rsq33);
2057
2058             fjx0             = _mm_setzero_pd();
2059             fjy0             = _mm_setzero_pd();
2060             fjz0             = _mm_setzero_pd();
2061             fjx1             = _mm_setzero_pd();
2062             fjy1             = _mm_setzero_pd();
2063             fjz1             = _mm_setzero_pd();
2064             fjx2             = _mm_setzero_pd();
2065             fjy2             = _mm_setzero_pd();
2066             fjz2             = _mm_setzero_pd();
2067             fjx3             = _mm_setzero_pd();
2068             fjy3             = _mm_setzero_pd();
2069             fjz3             = _mm_setzero_pd();
2070
2071             /**************************
2072              * CALCULATE INTERACTIONS *
2073              **************************/
2074
2075             r00              = _mm_mul_pd(rsq00,rinv00);
2076
2077             /* Calculate table index by multiplying r with table scale and truncate to integer */
2078             rt               = _mm_mul_pd(r00,vftabscale);
2079             vfitab           = _mm_cvttpd_epi32(rt);
2080 #ifdef __XOP__
2081             vfeps            = _mm_frcz_pd(rt);
2082 #else
2083             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2084 #endif
2085             twovfeps         = _mm_add_pd(vfeps,vfeps);
2086             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2087
2088             /* CUBIC SPLINE TABLE DISPERSION */
2089             vfitab           = _mm_add_epi32(vfitab,ifour);
2090             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2091             F                = _mm_setzero_pd();
2092             GMX_MM_TRANSPOSE2_PD(Y,F);
2093             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2094             H                = _mm_setzero_pd();
2095             GMX_MM_TRANSPOSE2_PD(G,H);
2096             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
2097             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
2098             fvdw6            = _mm_mul_pd(c6_00,FF);
2099
2100             /* CUBIC SPLINE TABLE REPULSION */
2101             vfitab           = _mm_add_epi32(vfitab,ifour);
2102             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2103             F                = _mm_setzero_pd();
2104             GMX_MM_TRANSPOSE2_PD(Y,F);
2105             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2106             H                = _mm_setzero_pd();
2107             GMX_MM_TRANSPOSE2_PD(G,H);
2108             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
2109             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
2110             fvdw12           = _mm_mul_pd(c12_00,FF);
2111             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2112
2113             fscal            = fvdw;
2114
2115             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2116
2117             /* Update vectorial force */
2118             fix0             = _mm_macc_pd(dx00,fscal,fix0);
2119             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
2120             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
2121             
2122             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
2123             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
2124             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
2125
2126             /**************************
2127              * CALCULATE INTERACTIONS *
2128              **************************/
2129
2130             r11              = _mm_mul_pd(rsq11,rinv11);
2131
2132             /* Calculate table index by multiplying r with table scale and truncate to integer */
2133             rt               = _mm_mul_pd(r11,vftabscale);
2134             vfitab           = _mm_cvttpd_epi32(rt);
2135 #ifdef __XOP__
2136             vfeps            = _mm_frcz_pd(rt);
2137 #else
2138             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2139 #endif
2140             twovfeps         = _mm_add_pd(vfeps,vfeps);
2141             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2142
2143             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2144             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2145             F                = _mm_setzero_pd();
2146             GMX_MM_TRANSPOSE2_PD(Y,F);
2147             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2148             H                = _mm_setzero_pd();
2149             GMX_MM_TRANSPOSE2_PD(G,H);
2150             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2151             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2152             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2153
2154             fscal            = felec;
2155
2156             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2157
2158             /* Update vectorial force */
2159             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2160             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2161             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2162             
2163             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2164             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2165             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2166
2167             /**************************
2168              * CALCULATE INTERACTIONS *
2169              **************************/
2170
2171             r12              = _mm_mul_pd(rsq12,rinv12);
2172
2173             /* Calculate table index by multiplying r with table scale and truncate to integer */
2174             rt               = _mm_mul_pd(r12,vftabscale);
2175             vfitab           = _mm_cvttpd_epi32(rt);
2176 #ifdef __XOP__
2177             vfeps            = _mm_frcz_pd(rt);
2178 #else
2179             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2180 #endif
2181             twovfeps         = _mm_add_pd(vfeps,vfeps);
2182             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2183
2184             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2185             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2186             F                = _mm_setzero_pd();
2187             GMX_MM_TRANSPOSE2_PD(Y,F);
2188             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2189             H                = _mm_setzero_pd();
2190             GMX_MM_TRANSPOSE2_PD(G,H);
2191             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2192             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2193             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2194
2195             fscal            = felec;
2196
2197             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2198
2199             /* Update vectorial force */
2200             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2201             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2202             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2203             
2204             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2205             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2206             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2207
2208             /**************************
2209              * CALCULATE INTERACTIONS *
2210              **************************/
2211
2212             r13              = _mm_mul_pd(rsq13,rinv13);
2213
2214             /* Calculate table index by multiplying r with table scale and truncate to integer */
2215             rt               = _mm_mul_pd(r13,vftabscale);
2216             vfitab           = _mm_cvttpd_epi32(rt);
2217 #ifdef __XOP__
2218             vfeps            = _mm_frcz_pd(rt);
2219 #else
2220             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2221 #endif
2222             twovfeps         = _mm_add_pd(vfeps,vfeps);
2223             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2224
2225             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2226             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2227             F                = _mm_setzero_pd();
2228             GMX_MM_TRANSPOSE2_PD(Y,F);
2229             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2230             H                = _mm_setzero_pd();
2231             GMX_MM_TRANSPOSE2_PD(G,H);
2232             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2233             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2234             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
2235
2236             fscal            = felec;
2237
2238             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2239
2240             /* Update vectorial force */
2241             fix1             = _mm_macc_pd(dx13,fscal,fix1);
2242             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
2243             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
2244             
2245             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
2246             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
2247             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
2248
2249             /**************************
2250              * CALCULATE INTERACTIONS *
2251              **************************/
2252
2253             r21              = _mm_mul_pd(rsq21,rinv21);
2254
2255             /* Calculate table index by multiplying r with table scale and truncate to integer */
2256             rt               = _mm_mul_pd(r21,vftabscale);
2257             vfitab           = _mm_cvttpd_epi32(rt);
2258 #ifdef __XOP__
2259             vfeps            = _mm_frcz_pd(rt);
2260 #else
2261             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2262 #endif
2263             twovfeps         = _mm_add_pd(vfeps,vfeps);
2264             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2265
2266             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2267             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2268             F                = _mm_setzero_pd();
2269             GMX_MM_TRANSPOSE2_PD(Y,F);
2270             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2271             H                = _mm_setzero_pd();
2272             GMX_MM_TRANSPOSE2_PD(G,H);
2273             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2274             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2275             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2276
2277             fscal            = felec;
2278
2279             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2280
2281             /* Update vectorial force */
2282             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2283             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2284             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2285             
2286             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2287             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2288             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2289
2290             /**************************
2291              * CALCULATE INTERACTIONS *
2292              **************************/
2293
2294             r22              = _mm_mul_pd(rsq22,rinv22);
2295
2296             /* Calculate table index by multiplying r with table scale and truncate to integer */
2297             rt               = _mm_mul_pd(r22,vftabscale);
2298             vfitab           = _mm_cvttpd_epi32(rt);
2299 #ifdef __XOP__
2300             vfeps            = _mm_frcz_pd(rt);
2301 #else
2302             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2303 #endif
2304             twovfeps         = _mm_add_pd(vfeps,vfeps);
2305             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2306
2307             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2308             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2309             F                = _mm_setzero_pd();
2310             GMX_MM_TRANSPOSE2_PD(Y,F);
2311             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2312             H                = _mm_setzero_pd();
2313             GMX_MM_TRANSPOSE2_PD(G,H);
2314             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2315             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2316             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2317
2318             fscal            = felec;
2319
2320             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2321
2322             /* Update vectorial force */
2323             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2324             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2325             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2326             
2327             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2328             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2329             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2330
2331             /**************************
2332              * CALCULATE INTERACTIONS *
2333              **************************/
2334
2335             r23              = _mm_mul_pd(rsq23,rinv23);
2336
2337             /* Calculate table index by multiplying r with table scale and truncate to integer */
2338             rt               = _mm_mul_pd(r23,vftabscale);
2339             vfitab           = _mm_cvttpd_epi32(rt);
2340 #ifdef __XOP__
2341             vfeps            = _mm_frcz_pd(rt);
2342 #else
2343             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2344 #endif
2345             twovfeps         = _mm_add_pd(vfeps,vfeps);
2346             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2347
2348             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2349             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2350             F                = _mm_setzero_pd();
2351             GMX_MM_TRANSPOSE2_PD(Y,F);
2352             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2353             H                = _mm_setzero_pd();
2354             GMX_MM_TRANSPOSE2_PD(G,H);
2355             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2356             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2357             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2358
2359             fscal            = felec;
2360
2361             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2362
2363             /* Update vectorial force */
2364             fix2             = _mm_macc_pd(dx23,fscal,fix2);
2365             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
2366             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
2367             
2368             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
2369             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
2370             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
2371
2372             /**************************
2373              * CALCULATE INTERACTIONS *
2374              **************************/
2375
2376             r31              = _mm_mul_pd(rsq31,rinv31);
2377
2378             /* Calculate table index by multiplying r with table scale and truncate to integer */
2379             rt               = _mm_mul_pd(r31,vftabscale);
2380             vfitab           = _mm_cvttpd_epi32(rt);
2381 #ifdef __XOP__
2382             vfeps            = _mm_frcz_pd(rt);
2383 #else
2384             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2385 #endif
2386             twovfeps         = _mm_add_pd(vfeps,vfeps);
2387             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2388
2389             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2390             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2391             F                = _mm_setzero_pd();
2392             GMX_MM_TRANSPOSE2_PD(Y,F);
2393             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2394             H                = _mm_setzero_pd();
2395             GMX_MM_TRANSPOSE2_PD(G,H);
2396             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2397             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2398             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2399
2400             fscal            = felec;
2401
2402             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2403
2404             /* Update vectorial force */
2405             fix3             = _mm_macc_pd(dx31,fscal,fix3);
2406             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
2407             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
2408             
2409             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
2410             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
2411             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
2412
2413             /**************************
2414              * CALCULATE INTERACTIONS *
2415              **************************/
2416
2417             r32              = _mm_mul_pd(rsq32,rinv32);
2418
2419             /* Calculate table index by multiplying r with table scale and truncate to integer */
2420             rt               = _mm_mul_pd(r32,vftabscale);
2421             vfitab           = _mm_cvttpd_epi32(rt);
2422 #ifdef __XOP__
2423             vfeps            = _mm_frcz_pd(rt);
2424 #else
2425             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2426 #endif
2427             twovfeps         = _mm_add_pd(vfeps,vfeps);
2428             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2429
2430             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2431             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2432             F                = _mm_setzero_pd();
2433             GMX_MM_TRANSPOSE2_PD(Y,F);
2434             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2435             H                = _mm_setzero_pd();
2436             GMX_MM_TRANSPOSE2_PD(G,H);
2437             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2438             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2439             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2440
2441             fscal            = felec;
2442
2443             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2444
2445             /* Update vectorial force */
2446             fix3             = _mm_macc_pd(dx32,fscal,fix3);
2447             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
2448             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
2449             
2450             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
2451             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
2452             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
2453
2454             /**************************
2455              * CALCULATE INTERACTIONS *
2456              **************************/
2457
2458             r33              = _mm_mul_pd(rsq33,rinv33);
2459
2460             /* Calculate table index by multiplying r with table scale and truncate to integer */
2461             rt               = _mm_mul_pd(r33,vftabscale);
2462             vfitab           = _mm_cvttpd_epi32(rt);
2463 #ifdef __XOP__
2464             vfeps            = _mm_frcz_pd(rt);
2465 #else
2466             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2467 #endif
2468             twovfeps         = _mm_add_pd(vfeps,vfeps);
2469             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2470
2471             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2472             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2473             F                = _mm_setzero_pd();
2474             GMX_MM_TRANSPOSE2_PD(Y,F);
2475             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2476             H                = _mm_setzero_pd();
2477             GMX_MM_TRANSPOSE2_PD(G,H);
2478             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2479             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2480             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2481
2482             fscal            = felec;
2483
2484             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2485
2486             /* Update vectorial force */
2487             fix3             = _mm_macc_pd(dx33,fscal,fix3);
2488             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
2489             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
2490             
2491             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
2492             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
2493             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
2494
2495             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2496
2497             /* Inner loop uses 432 flops */
2498         }
2499
2500         /* End of innermost loop */
2501
2502         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2503                                               f+i_coord_offset,fshift+i_shift_offset);
2504
2505         /* Increment number of inner iterations */
2506         inneriter                  += j_index_end - j_index_start;
2507
2508         /* Outer loop uses 24 flops */
2509     }
2510
2511     /* Increment number of outer iterations */
2512     outeriter        += nri;
2513
2514     /* Update outer/inner flops */
2515
2516     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*432);
2517 }