Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_double / nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_128_fma_double.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_double.h"
48 #include "kernelutil_x86_avx_128_fma_double.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_double
52  * Electrostatics interaction: CubicSplineTable
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_double
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB;
75     int              j_coord_offsetA,j_coord_offsetB;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80     int              vdwioffset0;
81     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82     int              vdwioffset1;
83     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84     int              vdwioffset2;
85     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86     int              vdwioffset3;
87     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
88     int              vdwjidx0A,vdwjidx0B;
89     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B;
91     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B;
93     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     int              vdwjidx3A,vdwjidx3B;
95     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
96     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
113     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
114     __m128i          vfitab;
115     __m128i          ifour       = _mm_set1_epi32(4);
116     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
117     real             *vftab;
118     __m128d          dummy_mask,cutoff_mask;
119     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
120     __m128d          one     = _mm_set1_pd(1.0);
121     __m128d          two     = _mm_set1_pd(2.0);
122     x                = xx[0];
123     f                = ff[0];
124
125     nri              = nlist->nri;
126     iinr             = nlist->iinr;
127     jindex           = nlist->jindex;
128     jjnr             = nlist->jjnr;
129     shiftidx         = nlist->shift;
130     gid              = nlist->gid;
131     shiftvec         = fr->shift_vec[0];
132     fshift           = fr->fshift[0];
133     facel            = _mm_set1_pd(fr->epsfac);
134     charge           = mdatoms->chargeA;
135     nvdwtype         = fr->ntype;
136     vdwparam         = fr->nbfp;
137     vdwtype          = mdatoms->typeA;
138
139     vftab            = kernel_data->table_elec_vdw->data;
140     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
141
142     /* Setup water-specific parameters */
143     inr              = nlist->iinr[0];
144     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
145     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
146     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
147     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
148
149     jq1              = _mm_set1_pd(charge[inr+1]);
150     jq2              = _mm_set1_pd(charge[inr+2]);
151     jq3              = _mm_set1_pd(charge[inr+3]);
152     vdwjidx0A        = 2*vdwtype[inr+0];
153     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
154     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
155     qq11             = _mm_mul_pd(iq1,jq1);
156     qq12             = _mm_mul_pd(iq1,jq2);
157     qq13             = _mm_mul_pd(iq1,jq3);
158     qq21             = _mm_mul_pd(iq2,jq1);
159     qq22             = _mm_mul_pd(iq2,jq2);
160     qq23             = _mm_mul_pd(iq2,jq3);
161     qq31             = _mm_mul_pd(iq3,jq1);
162     qq32             = _mm_mul_pd(iq3,jq2);
163     qq33             = _mm_mul_pd(iq3,jq3);
164
165     /* Avoid stupid compiler warnings */
166     jnrA = jnrB = 0;
167     j_coord_offsetA = 0;
168     j_coord_offsetB = 0;
169
170     outeriter        = 0;
171     inneriter        = 0;
172
173     /* Start outer loop over neighborlists */
174     for(iidx=0; iidx<nri; iidx++)
175     {
176         /* Load shift vector for this list */
177         i_shift_offset   = DIM*shiftidx[iidx];
178
179         /* Load limits for loop over neighbors */
180         j_index_start    = jindex[iidx];
181         j_index_end      = jindex[iidx+1];
182
183         /* Get outer coordinate index */
184         inr              = iinr[iidx];
185         i_coord_offset   = DIM*inr;
186
187         /* Load i particle coords and add shift vector */
188         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
189                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
190
191         fix0             = _mm_setzero_pd();
192         fiy0             = _mm_setzero_pd();
193         fiz0             = _mm_setzero_pd();
194         fix1             = _mm_setzero_pd();
195         fiy1             = _mm_setzero_pd();
196         fiz1             = _mm_setzero_pd();
197         fix2             = _mm_setzero_pd();
198         fiy2             = _mm_setzero_pd();
199         fiz2             = _mm_setzero_pd();
200         fix3             = _mm_setzero_pd();
201         fiy3             = _mm_setzero_pd();
202         fiz3             = _mm_setzero_pd();
203
204         /* Reset potential sums */
205         velecsum         = _mm_setzero_pd();
206         vvdwsum          = _mm_setzero_pd();
207
208         /* Start inner kernel loop */
209         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
210         {
211
212             /* Get j neighbor index, and coordinate index */
213             jnrA             = jjnr[jidx];
214             jnrB             = jjnr[jidx+1];
215             j_coord_offsetA  = DIM*jnrA;
216             j_coord_offsetB  = DIM*jnrB;
217
218             /* load j atom coordinates */
219             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
220                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
221                                               &jy2,&jz2,&jx3,&jy3,&jz3);
222
223             /* Calculate displacement vector */
224             dx00             = _mm_sub_pd(ix0,jx0);
225             dy00             = _mm_sub_pd(iy0,jy0);
226             dz00             = _mm_sub_pd(iz0,jz0);
227             dx11             = _mm_sub_pd(ix1,jx1);
228             dy11             = _mm_sub_pd(iy1,jy1);
229             dz11             = _mm_sub_pd(iz1,jz1);
230             dx12             = _mm_sub_pd(ix1,jx2);
231             dy12             = _mm_sub_pd(iy1,jy2);
232             dz12             = _mm_sub_pd(iz1,jz2);
233             dx13             = _mm_sub_pd(ix1,jx3);
234             dy13             = _mm_sub_pd(iy1,jy3);
235             dz13             = _mm_sub_pd(iz1,jz3);
236             dx21             = _mm_sub_pd(ix2,jx1);
237             dy21             = _mm_sub_pd(iy2,jy1);
238             dz21             = _mm_sub_pd(iz2,jz1);
239             dx22             = _mm_sub_pd(ix2,jx2);
240             dy22             = _mm_sub_pd(iy2,jy2);
241             dz22             = _mm_sub_pd(iz2,jz2);
242             dx23             = _mm_sub_pd(ix2,jx3);
243             dy23             = _mm_sub_pd(iy2,jy3);
244             dz23             = _mm_sub_pd(iz2,jz3);
245             dx31             = _mm_sub_pd(ix3,jx1);
246             dy31             = _mm_sub_pd(iy3,jy1);
247             dz31             = _mm_sub_pd(iz3,jz1);
248             dx32             = _mm_sub_pd(ix3,jx2);
249             dy32             = _mm_sub_pd(iy3,jy2);
250             dz32             = _mm_sub_pd(iz3,jz2);
251             dx33             = _mm_sub_pd(ix3,jx3);
252             dy33             = _mm_sub_pd(iy3,jy3);
253             dz33             = _mm_sub_pd(iz3,jz3);
254
255             /* Calculate squared distance and things based on it */
256             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
257             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
258             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
259             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
260             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
261             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
262             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
263             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
264             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
265             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
266
267             rinv00           = gmx_mm_invsqrt_pd(rsq00);
268             rinv11           = gmx_mm_invsqrt_pd(rsq11);
269             rinv12           = gmx_mm_invsqrt_pd(rsq12);
270             rinv13           = gmx_mm_invsqrt_pd(rsq13);
271             rinv21           = gmx_mm_invsqrt_pd(rsq21);
272             rinv22           = gmx_mm_invsqrt_pd(rsq22);
273             rinv23           = gmx_mm_invsqrt_pd(rsq23);
274             rinv31           = gmx_mm_invsqrt_pd(rsq31);
275             rinv32           = gmx_mm_invsqrt_pd(rsq32);
276             rinv33           = gmx_mm_invsqrt_pd(rsq33);
277
278             fjx0             = _mm_setzero_pd();
279             fjy0             = _mm_setzero_pd();
280             fjz0             = _mm_setzero_pd();
281             fjx1             = _mm_setzero_pd();
282             fjy1             = _mm_setzero_pd();
283             fjz1             = _mm_setzero_pd();
284             fjx2             = _mm_setzero_pd();
285             fjy2             = _mm_setzero_pd();
286             fjz2             = _mm_setzero_pd();
287             fjx3             = _mm_setzero_pd();
288             fjy3             = _mm_setzero_pd();
289             fjz3             = _mm_setzero_pd();
290
291             /**************************
292              * CALCULATE INTERACTIONS *
293              **************************/
294
295             r00              = _mm_mul_pd(rsq00,rinv00);
296
297             /* Calculate table index by multiplying r with table scale and truncate to integer */
298             rt               = _mm_mul_pd(r00,vftabscale);
299             vfitab           = _mm_cvttpd_epi32(rt);
300 #ifdef __XOP__
301             vfeps            = _mm_frcz_pd(rt);
302 #else
303             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
304 #endif
305             twovfeps         = _mm_add_pd(vfeps,vfeps);
306             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
307
308             /* CUBIC SPLINE TABLE DISPERSION */
309             vfitab           = _mm_add_epi32(vfitab,ifour);
310             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
311             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
312             GMX_MM_TRANSPOSE2_PD(Y,F);
313             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
314             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
315             GMX_MM_TRANSPOSE2_PD(G,H);
316             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
317             VV               = _mm_macc_pd(vfeps,Fp,Y);
318             vvdw6            = _mm_mul_pd(c6_00,VV);
319             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
320             fvdw6            = _mm_mul_pd(c6_00,FF);
321
322             /* CUBIC SPLINE TABLE REPULSION */
323             vfitab           = _mm_add_epi32(vfitab,ifour);
324             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
325             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
326             GMX_MM_TRANSPOSE2_PD(Y,F);
327             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
328             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
329             GMX_MM_TRANSPOSE2_PD(G,H);
330             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
331             VV               = _mm_macc_pd(vfeps,Fp,Y);
332             vvdw12           = _mm_mul_pd(c12_00,VV);
333             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
334             fvdw12           = _mm_mul_pd(c12_00,FF);
335             vvdw             = _mm_add_pd(vvdw12,vvdw6);
336             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
337
338             /* Update potential sum for this i atom from the interaction with this j atom. */
339             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
340
341             fscal            = fvdw;
342
343             /* Update vectorial force */
344             fix0             = _mm_macc_pd(dx00,fscal,fix0);
345             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
346             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
347             
348             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
349             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
350             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
351
352             /**************************
353              * CALCULATE INTERACTIONS *
354              **************************/
355
356             r11              = _mm_mul_pd(rsq11,rinv11);
357
358             /* Calculate table index by multiplying r with table scale and truncate to integer */
359             rt               = _mm_mul_pd(r11,vftabscale);
360             vfitab           = _mm_cvttpd_epi32(rt);
361 #ifdef __XOP__
362             vfeps            = _mm_frcz_pd(rt);
363 #else
364             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
365 #endif
366             twovfeps         = _mm_add_pd(vfeps,vfeps);
367             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
368
369             /* CUBIC SPLINE TABLE ELECTROSTATICS */
370             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
371             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
372             GMX_MM_TRANSPOSE2_PD(Y,F);
373             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
374             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
375             GMX_MM_TRANSPOSE2_PD(G,H);
376             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
377             VV               = _mm_macc_pd(vfeps,Fp,Y);
378             velec            = _mm_mul_pd(qq11,VV);
379             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
380             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
381
382             /* Update potential sum for this i atom from the interaction with this j atom. */
383             velecsum         = _mm_add_pd(velecsum,velec);
384
385             fscal            = felec;
386
387             /* Update vectorial force */
388             fix1             = _mm_macc_pd(dx11,fscal,fix1);
389             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
390             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
391             
392             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
393             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
394             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
395
396             /**************************
397              * CALCULATE INTERACTIONS *
398              **************************/
399
400             r12              = _mm_mul_pd(rsq12,rinv12);
401
402             /* Calculate table index by multiplying r with table scale and truncate to integer */
403             rt               = _mm_mul_pd(r12,vftabscale);
404             vfitab           = _mm_cvttpd_epi32(rt);
405 #ifdef __XOP__
406             vfeps            = _mm_frcz_pd(rt);
407 #else
408             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
409 #endif
410             twovfeps         = _mm_add_pd(vfeps,vfeps);
411             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
412
413             /* CUBIC SPLINE TABLE ELECTROSTATICS */
414             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
415             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
416             GMX_MM_TRANSPOSE2_PD(Y,F);
417             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
418             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
419             GMX_MM_TRANSPOSE2_PD(G,H);
420             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
421             VV               = _mm_macc_pd(vfeps,Fp,Y);
422             velec            = _mm_mul_pd(qq12,VV);
423             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
424             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
425
426             /* Update potential sum for this i atom from the interaction with this j atom. */
427             velecsum         = _mm_add_pd(velecsum,velec);
428
429             fscal            = felec;
430
431             /* Update vectorial force */
432             fix1             = _mm_macc_pd(dx12,fscal,fix1);
433             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
434             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
435             
436             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
437             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
438             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
439
440             /**************************
441              * CALCULATE INTERACTIONS *
442              **************************/
443
444             r13              = _mm_mul_pd(rsq13,rinv13);
445
446             /* Calculate table index by multiplying r with table scale and truncate to integer */
447             rt               = _mm_mul_pd(r13,vftabscale);
448             vfitab           = _mm_cvttpd_epi32(rt);
449 #ifdef __XOP__
450             vfeps            = _mm_frcz_pd(rt);
451 #else
452             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
453 #endif
454             twovfeps         = _mm_add_pd(vfeps,vfeps);
455             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
456
457             /* CUBIC SPLINE TABLE ELECTROSTATICS */
458             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
459             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
460             GMX_MM_TRANSPOSE2_PD(Y,F);
461             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
462             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
463             GMX_MM_TRANSPOSE2_PD(G,H);
464             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
465             VV               = _mm_macc_pd(vfeps,Fp,Y);
466             velec            = _mm_mul_pd(qq13,VV);
467             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
468             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
469
470             /* Update potential sum for this i atom from the interaction with this j atom. */
471             velecsum         = _mm_add_pd(velecsum,velec);
472
473             fscal            = felec;
474
475             /* Update vectorial force */
476             fix1             = _mm_macc_pd(dx13,fscal,fix1);
477             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
478             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
479             
480             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
481             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
482             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
483
484             /**************************
485              * CALCULATE INTERACTIONS *
486              **************************/
487
488             r21              = _mm_mul_pd(rsq21,rinv21);
489
490             /* Calculate table index by multiplying r with table scale and truncate to integer */
491             rt               = _mm_mul_pd(r21,vftabscale);
492             vfitab           = _mm_cvttpd_epi32(rt);
493 #ifdef __XOP__
494             vfeps            = _mm_frcz_pd(rt);
495 #else
496             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
497 #endif
498             twovfeps         = _mm_add_pd(vfeps,vfeps);
499             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
500
501             /* CUBIC SPLINE TABLE ELECTROSTATICS */
502             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
503             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
504             GMX_MM_TRANSPOSE2_PD(Y,F);
505             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
506             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
507             GMX_MM_TRANSPOSE2_PD(G,H);
508             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
509             VV               = _mm_macc_pd(vfeps,Fp,Y);
510             velec            = _mm_mul_pd(qq21,VV);
511             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
512             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
513
514             /* Update potential sum for this i atom from the interaction with this j atom. */
515             velecsum         = _mm_add_pd(velecsum,velec);
516
517             fscal            = felec;
518
519             /* Update vectorial force */
520             fix2             = _mm_macc_pd(dx21,fscal,fix2);
521             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
522             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
523             
524             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
525             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
526             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
527
528             /**************************
529              * CALCULATE INTERACTIONS *
530              **************************/
531
532             r22              = _mm_mul_pd(rsq22,rinv22);
533
534             /* Calculate table index by multiplying r with table scale and truncate to integer */
535             rt               = _mm_mul_pd(r22,vftabscale);
536             vfitab           = _mm_cvttpd_epi32(rt);
537 #ifdef __XOP__
538             vfeps            = _mm_frcz_pd(rt);
539 #else
540             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
541 #endif
542             twovfeps         = _mm_add_pd(vfeps,vfeps);
543             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
544
545             /* CUBIC SPLINE TABLE ELECTROSTATICS */
546             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
547             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
548             GMX_MM_TRANSPOSE2_PD(Y,F);
549             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
550             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
551             GMX_MM_TRANSPOSE2_PD(G,H);
552             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
553             VV               = _mm_macc_pd(vfeps,Fp,Y);
554             velec            = _mm_mul_pd(qq22,VV);
555             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
556             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
557
558             /* Update potential sum for this i atom from the interaction with this j atom. */
559             velecsum         = _mm_add_pd(velecsum,velec);
560
561             fscal            = felec;
562
563             /* Update vectorial force */
564             fix2             = _mm_macc_pd(dx22,fscal,fix2);
565             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
566             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
567             
568             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
569             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
570             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
571
572             /**************************
573              * CALCULATE INTERACTIONS *
574              **************************/
575
576             r23              = _mm_mul_pd(rsq23,rinv23);
577
578             /* Calculate table index by multiplying r with table scale and truncate to integer */
579             rt               = _mm_mul_pd(r23,vftabscale);
580             vfitab           = _mm_cvttpd_epi32(rt);
581 #ifdef __XOP__
582             vfeps            = _mm_frcz_pd(rt);
583 #else
584             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
585 #endif
586             twovfeps         = _mm_add_pd(vfeps,vfeps);
587             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
588
589             /* CUBIC SPLINE TABLE ELECTROSTATICS */
590             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
591             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
592             GMX_MM_TRANSPOSE2_PD(Y,F);
593             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
594             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
595             GMX_MM_TRANSPOSE2_PD(G,H);
596             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
597             VV               = _mm_macc_pd(vfeps,Fp,Y);
598             velec            = _mm_mul_pd(qq23,VV);
599             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
600             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
601
602             /* Update potential sum for this i atom from the interaction with this j atom. */
603             velecsum         = _mm_add_pd(velecsum,velec);
604
605             fscal            = felec;
606
607             /* Update vectorial force */
608             fix2             = _mm_macc_pd(dx23,fscal,fix2);
609             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
610             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
611             
612             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
613             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
614             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
615
616             /**************************
617              * CALCULATE INTERACTIONS *
618              **************************/
619
620             r31              = _mm_mul_pd(rsq31,rinv31);
621
622             /* Calculate table index by multiplying r with table scale and truncate to integer */
623             rt               = _mm_mul_pd(r31,vftabscale);
624             vfitab           = _mm_cvttpd_epi32(rt);
625 #ifdef __XOP__
626             vfeps            = _mm_frcz_pd(rt);
627 #else
628             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
629 #endif
630             twovfeps         = _mm_add_pd(vfeps,vfeps);
631             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
632
633             /* CUBIC SPLINE TABLE ELECTROSTATICS */
634             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
635             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
636             GMX_MM_TRANSPOSE2_PD(Y,F);
637             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
638             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
639             GMX_MM_TRANSPOSE2_PD(G,H);
640             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
641             VV               = _mm_macc_pd(vfeps,Fp,Y);
642             velec            = _mm_mul_pd(qq31,VV);
643             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
644             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
645
646             /* Update potential sum for this i atom from the interaction with this j atom. */
647             velecsum         = _mm_add_pd(velecsum,velec);
648
649             fscal            = felec;
650
651             /* Update vectorial force */
652             fix3             = _mm_macc_pd(dx31,fscal,fix3);
653             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
654             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
655             
656             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
657             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
658             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
659
660             /**************************
661              * CALCULATE INTERACTIONS *
662              **************************/
663
664             r32              = _mm_mul_pd(rsq32,rinv32);
665
666             /* Calculate table index by multiplying r with table scale and truncate to integer */
667             rt               = _mm_mul_pd(r32,vftabscale);
668             vfitab           = _mm_cvttpd_epi32(rt);
669 #ifdef __XOP__
670             vfeps            = _mm_frcz_pd(rt);
671 #else
672             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
673 #endif
674             twovfeps         = _mm_add_pd(vfeps,vfeps);
675             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
676
677             /* CUBIC SPLINE TABLE ELECTROSTATICS */
678             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
679             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
680             GMX_MM_TRANSPOSE2_PD(Y,F);
681             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
682             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
683             GMX_MM_TRANSPOSE2_PD(G,H);
684             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
685             VV               = _mm_macc_pd(vfeps,Fp,Y);
686             velec            = _mm_mul_pd(qq32,VV);
687             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
688             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
689
690             /* Update potential sum for this i atom from the interaction with this j atom. */
691             velecsum         = _mm_add_pd(velecsum,velec);
692
693             fscal            = felec;
694
695             /* Update vectorial force */
696             fix3             = _mm_macc_pd(dx32,fscal,fix3);
697             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
698             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
699             
700             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
701             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
702             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
703
704             /**************************
705              * CALCULATE INTERACTIONS *
706              **************************/
707
708             r33              = _mm_mul_pd(rsq33,rinv33);
709
710             /* Calculate table index by multiplying r with table scale and truncate to integer */
711             rt               = _mm_mul_pd(r33,vftabscale);
712             vfitab           = _mm_cvttpd_epi32(rt);
713 #ifdef __XOP__
714             vfeps            = _mm_frcz_pd(rt);
715 #else
716             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
717 #endif
718             twovfeps         = _mm_add_pd(vfeps,vfeps);
719             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
720
721             /* CUBIC SPLINE TABLE ELECTROSTATICS */
722             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
723             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
724             GMX_MM_TRANSPOSE2_PD(Y,F);
725             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
726             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
727             GMX_MM_TRANSPOSE2_PD(G,H);
728             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
729             VV               = _mm_macc_pd(vfeps,Fp,Y);
730             velec            = _mm_mul_pd(qq33,VV);
731             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
732             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
733
734             /* Update potential sum for this i atom from the interaction with this j atom. */
735             velecsum         = _mm_add_pd(velecsum,velec);
736
737             fscal            = felec;
738
739             /* Update vectorial force */
740             fix3             = _mm_macc_pd(dx33,fscal,fix3);
741             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
742             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
743             
744             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
745             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
746             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
747
748             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
749
750             /* Inner loop uses 476 flops */
751         }
752
753         if(jidx<j_index_end)
754         {
755
756             jnrA             = jjnr[jidx];
757             j_coord_offsetA  = DIM*jnrA;
758
759             /* load j atom coordinates */
760             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
761                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
762                                               &jy2,&jz2,&jx3,&jy3,&jz3);
763
764             /* Calculate displacement vector */
765             dx00             = _mm_sub_pd(ix0,jx0);
766             dy00             = _mm_sub_pd(iy0,jy0);
767             dz00             = _mm_sub_pd(iz0,jz0);
768             dx11             = _mm_sub_pd(ix1,jx1);
769             dy11             = _mm_sub_pd(iy1,jy1);
770             dz11             = _mm_sub_pd(iz1,jz1);
771             dx12             = _mm_sub_pd(ix1,jx2);
772             dy12             = _mm_sub_pd(iy1,jy2);
773             dz12             = _mm_sub_pd(iz1,jz2);
774             dx13             = _mm_sub_pd(ix1,jx3);
775             dy13             = _mm_sub_pd(iy1,jy3);
776             dz13             = _mm_sub_pd(iz1,jz3);
777             dx21             = _mm_sub_pd(ix2,jx1);
778             dy21             = _mm_sub_pd(iy2,jy1);
779             dz21             = _mm_sub_pd(iz2,jz1);
780             dx22             = _mm_sub_pd(ix2,jx2);
781             dy22             = _mm_sub_pd(iy2,jy2);
782             dz22             = _mm_sub_pd(iz2,jz2);
783             dx23             = _mm_sub_pd(ix2,jx3);
784             dy23             = _mm_sub_pd(iy2,jy3);
785             dz23             = _mm_sub_pd(iz2,jz3);
786             dx31             = _mm_sub_pd(ix3,jx1);
787             dy31             = _mm_sub_pd(iy3,jy1);
788             dz31             = _mm_sub_pd(iz3,jz1);
789             dx32             = _mm_sub_pd(ix3,jx2);
790             dy32             = _mm_sub_pd(iy3,jy2);
791             dz32             = _mm_sub_pd(iz3,jz2);
792             dx33             = _mm_sub_pd(ix3,jx3);
793             dy33             = _mm_sub_pd(iy3,jy3);
794             dz33             = _mm_sub_pd(iz3,jz3);
795
796             /* Calculate squared distance and things based on it */
797             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
798             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
799             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
800             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
801             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
802             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
803             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
804             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
805             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
806             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
807
808             rinv00           = gmx_mm_invsqrt_pd(rsq00);
809             rinv11           = gmx_mm_invsqrt_pd(rsq11);
810             rinv12           = gmx_mm_invsqrt_pd(rsq12);
811             rinv13           = gmx_mm_invsqrt_pd(rsq13);
812             rinv21           = gmx_mm_invsqrt_pd(rsq21);
813             rinv22           = gmx_mm_invsqrt_pd(rsq22);
814             rinv23           = gmx_mm_invsqrt_pd(rsq23);
815             rinv31           = gmx_mm_invsqrt_pd(rsq31);
816             rinv32           = gmx_mm_invsqrt_pd(rsq32);
817             rinv33           = gmx_mm_invsqrt_pd(rsq33);
818
819             fjx0             = _mm_setzero_pd();
820             fjy0             = _mm_setzero_pd();
821             fjz0             = _mm_setzero_pd();
822             fjx1             = _mm_setzero_pd();
823             fjy1             = _mm_setzero_pd();
824             fjz1             = _mm_setzero_pd();
825             fjx2             = _mm_setzero_pd();
826             fjy2             = _mm_setzero_pd();
827             fjz2             = _mm_setzero_pd();
828             fjx3             = _mm_setzero_pd();
829             fjy3             = _mm_setzero_pd();
830             fjz3             = _mm_setzero_pd();
831
832             /**************************
833              * CALCULATE INTERACTIONS *
834              **************************/
835
836             r00              = _mm_mul_pd(rsq00,rinv00);
837
838             /* Calculate table index by multiplying r with table scale and truncate to integer */
839             rt               = _mm_mul_pd(r00,vftabscale);
840             vfitab           = _mm_cvttpd_epi32(rt);
841 #ifdef __XOP__
842             vfeps            = _mm_frcz_pd(rt);
843 #else
844             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
845 #endif
846             twovfeps         = _mm_add_pd(vfeps,vfeps);
847             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
848
849             /* CUBIC SPLINE TABLE DISPERSION */
850             vfitab           = _mm_add_epi32(vfitab,ifour);
851             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
852             F                = _mm_setzero_pd();
853             GMX_MM_TRANSPOSE2_PD(Y,F);
854             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
855             H                = _mm_setzero_pd();
856             GMX_MM_TRANSPOSE2_PD(G,H);
857             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
858             VV               = _mm_macc_pd(vfeps,Fp,Y);
859             vvdw6            = _mm_mul_pd(c6_00,VV);
860             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
861             fvdw6            = _mm_mul_pd(c6_00,FF);
862
863             /* CUBIC SPLINE TABLE REPULSION */
864             vfitab           = _mm_add_epi32(vfitab,ifour);
865             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
866             F                = _mm_setzero_pd();
867             GMX_MM_TRANSPOSE2_PD(Y,F);
868             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
869             H                = _mm_setzero_pd();
870             GMX_MM_TRANSPOSE2_PD(G,H);
871             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
872             VV               = _mm_macc_pd(vfeps,Fp,Y);
873             vvdw12           = _mm_mul_pd(c12_00,VV);
874             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
875             fvdw12           = _mm_mul_pd(c12_00,FF);
876             vvdw             = _mm_add_pd(vvdw12,vvdw6);
877             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
878
879             /* Update potential sum for this i atom from the interaction with this j atom. */
880             vvdw             = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
881             vvdwsum          = _mm_add_pd(vvdwsum,vvdw);
882
883             fscal            = fvdw;
884
885             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
886
887             /* Update vectorial force */
888             fix0             = _mm_macc_pd(dx00,fscal,fix0);
889             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
890             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
891             
892             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
893             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
894             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
895
896             /**************************
897              * CALCULATE INTERACTIONS *
898              **************************/
899
900             r11              = _mm_mul_pd(rsq11,rinv11);
901
902             /* Calculate table index by multiplying r with table scale and truncate to integer */
903             rt               = _mm_mul_pd(r11,vftabscale);
904             vfitab           = _mm_cvttpd_epi32(rt);
905 #ifdef __XOP__
906             vfeps            = _mm_frcz_pd(rt);
907 #else
908             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
909 #endif
910             twovfeps         = _mm_add_pd(vfeps,vfeps);
911             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
912
913             /* CUBIC SPLINE TABLE ELECTROSTATICS */
914             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
915             F                = _mm_setzero_pd();
916             GMX_MM_TRANSPOSE2_PD(Y,F);
917             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
918             H                = _mm_setzero_pd();
919             GMX_MM_TRANSPOSE2_PD(G,H);
920             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
921             VV               = _mm_macc_pd(vfeps,Fp,Y);
922             velec            = _mm_mul_pd(qq11,VV);
923             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
924             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
925
926             /* Update potential sum for this i atom from the interaction with this j atom. */
927             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
928             velecsum         = _mm_add_pd(velecsum,velec);
929
930             fscal            = felec;
931
932             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
933
934             /* Update vectorial force */
935             fix1             = _mm_macc_pd(dx11,fscal,fix1);
936             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
937             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
938             
939             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
940             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
941             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
942
943             /**************************
944              * CALCULATE INTERACTIONS *
945              **************************/
946
947             r12              = _mm_mul_pd(rsq12,rinv12);
948
949             /* Calculate table index by multiplying r with table scale and truncate to integer */
950             rt               = _mm_mul_pd(r12,vftabscale);
951             vfitab           = _mm_cvttpd_epi32(rt);
952 #ifdef __XOP__
953             vfeps            = _mm_frcz_pd(rt);
954 #else
955             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
956 #endif
957             twovfeps         = _mm_add_pd(vfeps,vfeps);
958             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
959
960             /* CUBIC SPLINE TABLE ELECTROSTATICS */
961             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
962             F                = _mm_setzero_pd();
963             GMX_MM_TRANSPOSE2_PD(Y,F);
964             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
965             H                = _mm_setzero_pd();
966             GMX_MM_TRANSPOSE2_PD(G,H);
967             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
968             VV               = _mm_macc_pd(vfeps,Fp,Y);
969             velec            = _mm_mul_pd(qq12,VV);
970             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
971             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
972
973             /* Update potential sum for this i atom from the interaction with this j atom. */
974             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
975             velecsum         = _mm_add_pd(velecsum,velec);
976
977             fscal            = felec;
978
979             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
980
981             /* Update vectorial force */
982             fix1             = _mm_macc_pd(dx12,fscal,fix1);
983             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
984             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
985             
986             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
987             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
988             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
989
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             r13              = _mm_mul_pd(rsq13,rinv13);
995
996             /* Calculate table index by multiplying r with table scale and truncate to integer */
997             rt               = _mm_mul_pd(r13,vftabscale);
998             vfitab           = _mm_cvttpd_epi32(rt);
999 #ifdef __XOP__
1000             vfeps            = _mm_frcz_pd(rt);
1001 #else
1002             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1003 #endif
1004             twovfeps         = _mm_add_pd(vfeps,vfeps);
1005             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1006
1007             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1008             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1009             F                = _mm_setzero_pd();
1010             GMX_MM_TRANSPOSE2_PD(Y,F);
1011             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1012             H                = _mm_setzero_pd();
1013             GMX_MM_TRANSPOSE2_PD(G,H);
1014             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1015             VV               = _mm_macc_pd(vfeps,Fp,Y);
1016             velec            = _mm_mul_pd(qq13,VV);
1017             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1018             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1019
1020             /* Update potential sum for this i atom from the interaction with this j atom. */
1021             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1022             velecsum         = _mm_add_pd(velecsum,velec);
1023
1024             fscal            = felec;
1025
1026             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1027
1028             /* Update vectorial force */
1029             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1030             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1031             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1032             
1033             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1034             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1035             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1036
1037             /**************************
1038              * CALCULATE INTERACTIONS *
1039              **************************/
1040
1041             r21              = _mm_mul_pd(rsq21,rinv21);
1042
1043             /* Calculate table index by multiplying r with table scale and truncate to integer */
1044             rt               = _mm_mul_pd(r21,vftabscale);
1045             vfitab           = _mm_cvttpd_epi32(rt);
1046 #ifdef __XOP__
1047             vfeps            = _mm_frcz_pd(rt);
1048 #else
1049             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1050 #endif
1051             twovfeps         = _mm_add_pd(vfeps,vfeps);
1052             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1053
1054             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1055             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1056             F                = _mm_setzero_pd();
1057             GMX_MM_TRANSPOSE2_PD(Y,F);
1058             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1059             H                = _mm_setzero_pd();
1060             GMX_MM_TRANSPOSE2_PD(G,H);
1061             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1062             VV               = _mm_macc_pd(vfeps,Fp,Y);
1063             velec            = _mm_mul_pd(qq21,VV);
1064             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1065             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1066
1067             /* Update potential sum for this i atom from the interaction with this j atom. */
1068             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1069             velecsum         = _mm_add_pd(velecsum,velec);
1070
1071             fscal            = felec;
1072
1073             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1074
1075             /* Update vectorial force */
1076             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1077             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1078             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1079             
1080             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1081             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1082             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1083
1084             /**************************
1085              * CALCULATE INTERACTIONS *
1086              **************************/
1087
1088             r22              = _mm_mul_pd(rsq22,rinv22);
1089
1090             /* Calculate table index by multiplying r with table scale and truncate to integer */
1091             rt               = _mm_mul_pd(r22,vftabscale);
1092             vfitab           = _mm_cvttpd_epi32(rt);
1093 #ifdef __XOP__
1094             vfeps            = _mm_frcz_pd(rt);
1095 #else
1096             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1097 #endif
1098             twovfeps         = _mm_add_pd(vfeps,vfeps);
1099             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1100
1101             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1102             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1103             F                = _mm_setzero_pd();
1104             GMX_MM_TRANSPOSE2_PD(Y,F);
1105             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1106             H                = _mm_setzero_pd();
1107             GMX_MM_TRANSPOSE2_PD(G,H);
1108             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1109             VV               = _mm_macc_pd(vfeps,Fp,Y);
1110             velec            = _mm_mul_pd(qq22,VV);
1111             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1112             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1113
1114             /* Update potential sum for this i atom from the interaction with this j atom. */
1115             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1116             velecsum         = _mm_add_pd(velecsum,velec);
1117
1118             fscal            = felec;
1119
1120             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1121
1122             /* Update vectorial force */
1123             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1124             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1125             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1126             
1127             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1128             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1129             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1130
1131             /**************************
1132              * CALCULATE INTERACTIONS *
1133              **************************/
1134
1135             r23              = _mm_mul_pd(rsq23,rinv23);
1136
1137             /* Calculate table index by multiplying r with table scale and truncate to integer */
1138             rt               = _mm_mul_pd(r23,vftabscale);
1139             vfitab           = _mm_cvttpd_epi32(rt);
1140 #ifdef __XOP__
1141             vfeps            = _mm_frcz_pd(rt);
1142 #else
1143             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1144 #endif
1145             twovfeps         = _mm_add_pd(vfeps,vfeps);
1146             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1147
1148             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1149             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1150             F                = _mm_setzero_pd();
1151             GMX_MM_TRANSPOSE2_PD(Y,F);
1152             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1153             H                = _mm_setzero_pd();
1154             GMX_MM_TRANSPOSE2_PD(G,H);
1155             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1156             VV               = _mm_macc_pd(vfeps,Fp,Y);
1157             velec            = _mm_mul_pd(qq23,VV);
1158             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1159             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1160
1161             /* Update potential sum for this i atom from the interaction with this j atom. */
1162             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1163             velecsum         = _mm_add_pd(velecsum,velec);
1164
1165             fscal            = felec;
1166
1167             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1168
1169             /* Update vectorial force */
1170             fix2             = _mm_macc_pd(dx23,fscal,fix2);
1171             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
1172             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
1173             
1174             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
1175             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
1176             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
1177
1178             /**************************
1179              * CALCULATE INTERACTIONS *
1180              **************************/
1181
1182             r31              = _mm_mul_pd(rsq31,rinv31);
1183
1184             /* Calculate table index by multiplying r with table scale and truncate to integer */
1185             rt               = _mm_mul_pd(r31,vftabscale);
1186             vfitab           = _mm_cvttpd_epi32(rt);
1187 #ifdef __XOP__
1188             vfeps            = _mm_frcz_pd(rt);
1189 #else
1190             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1191 #endif
1192             twovfeps         = _mm_add_pd(vfeps,vfeps);
1193             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1194
1195             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1196             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1197             F                = _mm_setzero_pd();
1198             GMX_MM_TRANSPOSE2_PD(Y,F);
1199             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1200             H                = _mm_setzero_pd();
1201             GMX_MM_TRANSPOSE2_PD(G,H);
1202             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1203             VV               = _mm_macc_pd(vfeps,Fp,Y);
1204             velec            = _mm_mul_pd(qq31,VV);
1205             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1206             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1207
1208             /* Update potential sum for this i atom from the interaction with this j atom. */
1209             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1210             velecsum         = _mm_add_pd(velecsum,velec);
1211
1212             fscal            = felec;
1213
1214             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1215
1216             /* Update vectorial force */
1217             fix3             = _mm_macc_pd(dx31,fscal,fix3);
1218             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
1219             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
1220             
1221             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
1222             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
1223             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
1224
1225             /**************************
1226              * CALCULATE INTERACTIONS *
1227              **************************/
1228
1229             r32              = _mm_mul_pd(rsq32,rinv32);
1230
1231             /* Calculate table index by multiplying r with table scale and truncate to integer */
1232             rt               = _mm_mul_pd(r32,vftabscale);
1233             vfitab           = _mm_cvttpd_epi32(rt);
1234 #ifdef __XOP__
1235             vfeps            = _mm_frcz_pd(rt);
1236 #else
1237             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1238 #endif
1239             twovfeps         = _mm_add_pd(vfeps,vfeps);
1240             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1241
1242             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1243             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1244             F                = _mm_setzero_pd();
1245             GMX_MM_TRANSPOSE2_PD(Y,F);
1246             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1247             H                = _mm_setzero_pd();
1248             GMX_MM_TRANSPOSE2_PD(G,H);
1249             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1250             VV               = _mm_macc_pd(vfeps,Fp,Y);
1251             velec            = _mm_mul_pd(qq32,VV);
1252             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1253             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1254
1255             /* Update potential sum for this i atom from the interaction with this j atom. */
1256             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1257             velecsum         = _mm_add_pd(velecsum,velec);
1258
1259             fscal            = felec;
1260
1261             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1262
1263             /* Update vectorial force */
1264             fix3             = _mm_macc_pd(dx32,fscal,fix3);
1265             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
1266             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
1267             
1268             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
1269             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
1270             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
1271
1272             /**************************
1273              * CALCULATE INTERACTIONS *
1274              **************************/
1275
1276             r33              = _mm_mul_pd(rsq33,rinv33);
1277
1278             /* Calculate table index by multiplying r with table scale and truncate to integer */
1279             rt               = _mm_mul_pd(r33,vftabscale);
1280             vfitab           = _mm_cvttpd_epi32(rt);
1281 #ifdef __XOP__
1282             vfeps            = _mm_frcz_pd(rt);
1283 #else
1284             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1285 #endif
1286             twovfeps         = _mm_add_pd(vfeps,vfeps);
1287             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1288
1289             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1290             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1291             F                = _mm_setzero_pd();
1292             GMX_MM_TRANSPOSE2_PD(Y,F);
1293             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1294             H                = _mm_setzero_pd();
1295             GMX_MM_TRANSPOSE2_PD(G,H);
1296             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1297             VV               = _mm_macc_pd(vfeps,Fp,Y);
1298             velec            = _mm_mul_pd(qq33,VV);
1299             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1300             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1301
1302             /* Update potential sum for this i atom from the interaction with this j atom. */
1303             velec            = _mm_unpacklo_pd(velec,_mm_setzero_pd());
1304             velecsum         = _mm_add_pd(velecsum,velec);
1305
1306             fscal            = felec;
1307
1308             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1309
1310             /* Update vectorial force */
1311             fix3             = _mm_macc_pd(dx33,fscal,fix3);
1312             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
1313             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
1314             
1315             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
1316             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
1317             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
1318
1319             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1320
1321             /* Inner loop uses 476 flops */
1322         }
1323
1324         /* End of innermost loop */
1325
1326         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1327                                               f+i_coord_offset,fshift+i_shift_offset);
1328
1329         ggid                        = gid[iidx];
1330         /* Update potential energies */
1331         gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1332         gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1333
1334         /* Increment number of inner iterations */
1335         inneriter                  += j_index_end - j_index_start;
1336
1337         /* Outer loop uses 26 flops */
1338     }
1339
1340     /* Increment number of outer iterations */
1341     outeriter        += nri;
1342
1343     /* Update outer/inner flops */
1344
1345     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*476);
1346 }
1347 /*
1348  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_double
1349  * Electrostatics interaction: CubicSplineTable
1350  * VdW interaction:            CubicSplineTable
1351  * Geometry:                   Water4-Water4
1352  * Calculate force/pot:        Force
1353  */
1354 void
1355 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_double
1356                     (t_nblist                    * gmx_restrict       nlist,
1357                      rvec                        * gmx_restrict          xx,
1358                      rvec                        * gmx_restrict          ff,
1359                      t_forcerec                  * gmx_restrict          fr,
1360                      t_mdatoms                   * gmx_restrict     mdatoms,
1361                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1362                      t_nrnb                      * gmx_restrict        nrnb)
1363 {
1364     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1365      * just 0 for non-waters.
1366      * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
1367      * jnr indices corresponding to data put in the four positions in the SIMD register.
1368      */
1369     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1370     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1371     int              jnrA,jnrB;
1372     int              j_coord_offsetA,j_coord_offsetB;
1373     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1374     real             rcutoff_scalar;
1375     real             *shiftvec,*fshift,*x,*f;
1376     __m128d          tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1377     int              vdwioffset0;
1378     __m128d          ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1379     int              vdwioffset1;
1380     __m128d          ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1381     int              vdwioffset2;
1382     __m128d          ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1383     int              vdwioffset3;
1384     __m128d          ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1385     int              vdwjidx0A,vdwjidx0B;
1386     __m128d          jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1387     int              vdwjidx1A,vdwjidx1B;
1388     __m128d          jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1389     int              vdwjidx2A,vdwjidx2B;
1390     __m128d          jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1391     int              vdwjidx3A,vdwjidx3B;
1392     __m128d          jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1393     __m128d          dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1394     __m128d          dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1395     __m128d          dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1396     __m128d          dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1397     __m128d          dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1398     __m128d          dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1399     __m128d          dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1400     __m128d          dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1401     __m128d          dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1402     __m128d          dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1403     __m128d          velec,felec,velecsum,facel,crf,krf,krf2;
1404     real             *charge;
1405     int              nvdwtype;
1406     __m128d          rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1407     int              *vdwtype;
1408     real             *vdwparam;
1409     __m128d          one_sixth   = _mm_set1_pd(1.0/6.0);
1410     __m128d          one_twelfth = _mm_set1_pd(1.0/12.0);
1411     __m128i          vfitab;
1412     __m128i          ifour       = _mm_set1_epi32(4);
1413     __m128d          rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
1414     real             *vftab;
1415     __m128d          dummy_mask,cutoff_mask;
1416     __m128d          signbit   = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
1417     __m128d          one     = _mm_set1_pd(1.0);
1418     __m128d          two     = _mm_set1_pd(2.0);
1419     x                = xx[0];
1420     f                = ff[0];
1421
1422     nri              = nlist->nri;
1423     iinr             = nlist->iinr;
1424     jindex           = nlist->jindex;
1425     jjnr             = nlist->jjnr;
1426     shiftidx         = nlist->shift;
1427     gid              = nlist->gid;
1428     shiftvec         = fr->shift_vec[0];
1429     fshift           = fr->fshift[0];
1430     facel            = _mm_set1_pd(fr->epsfac);
1431     charge           = mdatoms->chargeA;
1432     nvdwtype         = fr->ntype;
1433     vdwparam         = fr->nbfp;
1434     vdwtype          = mdatoms->typeA;
1435
1436     vftab            = kernel_data->table_elec_vdw->data;
1437     vftabscale       = _mm_set1_pd(kernel_data->table_elec_vdw->scale);
1438
1439     /* Setup water-specific parameters */
1440     inr              = nlist->iinr[0];
1441     iq1              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
1442     iq2              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
1443     iq3              = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
1444     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1445
1446     jq1              = _mm_set1_pd(charge[inr+1]);
1447     jq2              = _mm_set1_pd(charge[inr+2]);
1448     jq3              = _mm_set1_pd(charge[inr+3]);
1449     vdwjidx0A        = 2*vdwtype[inr+0];
1450     c6_00            = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A]);
1451     c12_00           = _mm_set1_pd(vdwparam[vdwioffset0+vdwjidx0A+1]);
1452     qq11             = _mm_mul_pd(iq1,jq1);
1453     qq12             = _mm_mul_pd(iq1,jq2);
1454     qq13             = _mm_mul_pd(iq1,jq3);
1455     qq21             = _mm_mul_pd(iq2,jq1);
1456     qq22             = _mm_mul_pd(iq2,jq2);
1457     qq23             = _mm_mul_pd(iq2,jq3);
1458     qq31             = _mm_mul_pd(iq3,jq1);
1459     qq32             = _mm_mul_pd(iq3,jq2);
1460     qq33             = _mm_mul_pd(iq3,jq3);
1461
1462     /* Avoid stupid compiler warnings */
1463     jnrA = jnrB = 0;
1464     j_coord_offsetA = 0;
1465     j_coord_offsetB = 0;
1466
1467     outeriter        = 0;
1468     inneriter        = 0;
1469
1470     /* Start outer loop over neighborlists */
1471     for(iidx=0; iidx<nri; iidx++)
1472     {
1473         /* Load shift vector for this list */
1474         i_shift_offset   = DIM*shiftidx[iidx];
1475
1476         /* Load limits for loop over neighbors */
1477         j_index_start    = jindex[iidx];
1478         j_index_end      = jindex[iidx+1];
1479
1480         /* Get outer coordinate index */
1481         inr              = iinr[iidx];
1482         i_coord_offset   = DIM*inr;
1483
1484         /* Load i particle coords and add shift vector */
1485         gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1486                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1487
1488         fix0             = _mm_setzero_pd();
1489         fiy0             = _mm_setzero_pd();
1490         fiz0             = _mm_setzero_pd();
1491         fix1             = _mm_setzero_pd();
1492         fiy1             = _mm_setzero_pd();
1493         fiz1             = _mm_setzero_pd();
1494         fix2             = _mm_setzero_pd();
1495         fiy2             = _mm_setzero_pd();
1496         fiz2             = _mm_setzero_pd();
1497         fix3             = _mm_setzero_pd();
1498         fiy3             = _mm_setzero_pd();
1499         fiz3             = _mm_setzero_pd();
1500
1501         /* Start inner kernel loop */
1502         for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
1503         {
1504
1505             /* Get j neighbor index, and coordinate index */
1506             jnrA             = jjnr[jidx];
1507             jnrB             = jjnr[jidx+1];
1508             j_coord_offsetA  = DIM*jnrA;
1509             j_coord_offsetB  = DIM*jnrB;
1510
1511             /* load j atom coordinates */
1512             gmx_mm_load_4rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1513                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1514                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1515
1516             /* Calculate displacement vector */
1517             dx00             = _mm_sub_pd(ix0,jx0);
1518             dy00             = _mm_sub_pd(iy0,jy0);
1519             dz00             = _mm_sub_pd(iz0,jz0);
1520             dx11             = _mm_sub_pd(ix1,jx1);
1521             dy11             = _mm_sub_pd(iy1,jy1);
1522             dz11             = _mm_sub_pd(iz1,jz1);
1523             dx12             = _mm_sub_pd(ix1,jx2);
1524             dy12             = _mm_sub_pd(iy1,jy2);
1525             dz12             = _mm_sub_pd(iz1,jz2);
1526             dx13             = _mm_sub_pd(ix1,jx3);
1527             dy13             = _mm_sub_pd(iy1,jy3);
1528             dz13             = _mm_sub_pd(iz1,jz3);
1529             dx21             = _mm_sub_pd(ix2,jx1);
1530             dy21             = _mm_sub_pd(iy2,jy1);
1531             dz21             = _mm_sub_pd(iz2,jz1);
1532             dx22             = _mm_sub_pd(ix2,jx2);
1533             dy22             = _mm_sub_pd(iy2,jy2);
1534             dz22             = _mm_sub_pd(iz2,jz2);
1535             dx23             = _mm_sub_pd(ix2,jx3);
1536             dy23             = _mm_sub_pd(iy2,jy3);
1537             dz23             = _mm_sub_pd(iz2,jz3);
1538             dx31             = _mm_sub_pd(ix3,jx1);
1539             dy31             = _mm_sub_pd(iy3,jy1);
1540             dz31             = _mm_sub_pd(iz3,jz1);
1541             dx32             = _mm_sub_pd(ix3,jx2);
1542             dy32             = _mm_sub_pd(iy3,jy2);
1543             dz32             = _mm_sub_pd(iz3,jz2);
1544             dx33             = _mm_sub_pd(ix3,jx3);
1545             dy33             = _mm_sub_pd(iy3,jy3);
1546             dz33             = _mm_sub_pd(iz3,jz3);
1547
1548             /* Calculate squared distance and things based on it */
1549             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1550             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
1551             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
1552             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
1553             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
1554             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
1555             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
1556             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
1557             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
1558             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
1559
1560             rinv00           = gmx_mm_invsqrt_pd(rsq00);
1561             rinv11           = gmx_mm_invsqrt_pd(rsq11);
1562             rinv12           = gmx_mm_invsqrt_pd(rsq12);
1563             rinv13           = gmx_mm_invsqrt_pd(rsq13);
1564             rinv21           = gmx_mm_invsqrt_pd(rsq21);
1565             rinv22           = gmx_mm_invsqrt_pd(rsq22);
1566             rinv23           = gmx_mm_invsqrt_pd(rsq23);
1567             rinv31           = gmx_mm_invsqrt_pd(rsq31);
1568             rinv32           = gmx_mm_invsqrt_pd(rsq32);
1569             rinv33           = gmx_mm_invsqrt_pd(rsq33);
1570
1571             fjx0             = _mm_setzero_pd();
1572             fjy0             = _mm_setzero_pd();
1573             fjz0             = _mm_setzero_pd();
1574             fjx1             = _mm_setzero_pd();
1575             fjy1             = _mm_setzero_pd();
1576             fjz1             = _mm_setzero_pd();
1577             fjx2             = _mm_setzero_pd();
1578             fjy2             = _mm_setzero_pd();
1579             fjz2             = _mm_setzero_pd();
1580             fjx3             = _mm_setzero_pd();
1581             fjy3             = _mm_setzero_pd();
1582             fjz3             = _mm_setzero_pd();
1583
1584             /**************************
1585              * CALCULATE INTERACTIONS *
1586              **************************/
1587
1588             r00              = _mm_mul_pd(rsq00,rinv00);
1589
1590             /* Calculate table index by multiplying r with table scale and truncate to integer */
1591             rt               = _mm_mul_pd(r00,vftabscale);
1592             vfitab           = _mm_cvttpd_epi32(rt);
1593 #ifdef __XOP__
1594             vfeps            = _mm_frcz_pd(rt);
1595 #else
1596             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1597 #endif
1598             twovfeps         = _mm_add_pd(vfeps,vfeps);
1599             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1600
1601             /* CUBIC SPLINE TABLE DISPERSION */
1602             vfitab           = _mm_add_epi32(vfitab,ifour);
1603             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1604             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1605             GMX_MM_TRANSPOSE2_PD(Y,F);
1606             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1607             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1608             GMX_MM_TRANSPOSE2_PD(G,H);
1609             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1610             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1611             fvdw6            = _mm_mul_pd(c6_00,FF);
1612
1613             /* CUBIC SPLINE TABLE REPULSION */
1614             vfitab           = _mm_add_epi32(vfitab,ifour);
1615             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1616             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1617             GMX_MM_TRANSPOSE2_PD(Y,F);
1618             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1619             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1620             GMX_MM_TRANSPOSE2_PD(G,H);
1621             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1622             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1623             fvdw12           = _mm_mul_pd(c12_00,FF);
1624             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1625
1626             fscal            = fvdw;
1627
1628             /* Update vectorial force */
1629             fix0             = _mm_macc_pd(dx00,fscal,fix0);
1630             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
1631             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
1632             
1633             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
1634             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
1635             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
1636
1637             /**************************
1638              * CALCULATE INTERACTIONS *
1639              **************************/
1640
1641             r11              = _mm_mul_pd(rsq11,rinv11);
1642
1643             /* Calculate table index by multiplying r with table scale and truncate to integer */
1644             rt               = _mm_mul_pd(r11,vftabscale);
1645             vfitab           = _mm_cvttpd_epi32(rt);
1646 #ifdef __XOP__
1647             vfeps            = _mm_frcz_pd(rt);
1648 #else
1649             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1650 #endif
1651             twovfeps         = _mm_add_pd(vfeps,vfeps);
1652             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1653
1654             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1655             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1656             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1657             GMX_MM_TRANSPOSE2_PD(Y,F);
1658             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1659             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1660             GMX_MM_TRANSPOSE2_PD(G,H);
1661             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1662             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1663             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
1664
1665             fscal            = felec;
1666
1667             /* Update vectorial force */
1668             fix1             = _mm_macc_pd(dx11,fscal,fix1);
1669             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
1670             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
1671             
1672             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
1673             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
1674             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
1675
1676             /**************************
1677              * CALCULATE INTERACTIONS *
1678              **************************/
1679
1680             r12              = _mm_mul_pd(rsq12,rinv12);
1681
1682             /* Calculate table index by multiplying r with table scale and truncate to integer */
1683             rt               = _mm_mul_pd(r12,vftabscale);
1684             vfitab           = _mm_cvttpd_epi32(rt);
1685 #ifdef __XOP__
1686             vfeps            = _mm_frcz_pd(rt);
1687 #else
1688             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1689 #endif
1690             twovfeps         = _mm_add_pd(vfeps,vfeps);
1691             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1692
1693             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1694             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1695             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1696             GMX_MM_TRANSPOSE2_PD(Y,F);
1697             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1698             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1699             GMX_MM_TRANSPOSE2_PD(G,H);
1700             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1701             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1702             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
1703
1704             fscal            = felec;
1705
1706             /* Update vectorial force */
1707             fix1             = _mm_macc_pd(dx12,fscal,fix1);
1708             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
1709             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
1710             
1711             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
1712             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
1713             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
1714
1715             /**************************
1716              * CALCULATE INTERACTIONS *
1717              **************************/
1718
1719             r13              = _mm_mul_pd(rsq13,rinv13);
1720
1721             /* Calculate table index by multiplying r with table scale and truncate to integer */
1722             rt               = _mm_mul_pd(r13,vftabscale);
1723             vfitab           = _mm_cvttpd_epi32(rt);
1724 #ifdef __XOP__
1725             vfeps            = _mm_frcz_pd(rt);
1726 #else
1727             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1728 #endif
1729             twovfeps         = _mm_add_pd(vfeps,vfeps);
1730             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1731
1732             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1733             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1734             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1735             GMX_MM_TRANSPOSE2_PD(Y,F);
1736             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1737             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1738             GMX_MM_TRANSPOSE2_PD(G,H);
1739             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1740             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1741             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
1742
1743             fscal            = felec;
1744
1745             /* Update vectorial force */
1746             fix1             = _mm_macc_pd(dx13,fscal,fix1);
1747             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
1748             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
1749             
1750             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
1751             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
1752             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
1753
1754             /**************************
1755              * CALCULATE INTERACTIONS *
1756              **************************/
1757
1758             r21              = _mm_mul_pd(rsq21,rinv21);
1759
1760             /* Calculate table index by multiplying r with table scale and truncate to integer */
1761             rt               = _mm_mul_pd(r21,vftabscale);
1762             vfitab           = _mm_cvttpd_epi32(rt);
1763 #ifdef __XOP__
1764             vfeps            = _mm_frcz_pd(rt);
1765 #else
1766             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1767 #endif
1768             twovfeps         = _mm_add_pd(vfeps,vfeps);
1769             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1770
1771             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1772             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1773             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1774             GMX_MM_TRANSPOSE2_PD(Y,F);
1775             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1776             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1777             GMX_MM_TRANSPOSE2_PD(G,H);
1778             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1779             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1780             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
1781
1782             fscal            = felec;
1783
1784             /* Update vectorial force */
1785             fix2             = _mm_macc_pd(dx21,fscal,fix2);
1786             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
1787             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
1788             
1789             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
1790             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
1791             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
1792
1793             /**************************
1794              * CALCULATE INTERACTIONS *
1795              **************************/
1796
1797             r22              = _mm_mul_pd(rsq22,rinv22);
1798
1799             /* Calculate table index by multiplying r with table scale and truncate to integer */
1800             rt               = _mm_mul_pd(r22,vftabscale);
1801             vfitab           = _mm_cvttpd_epi32(rt);
1802 #ifdef __XOP__
1803             vfeps            = _mm_frcz_pd(rt);
1804 #else
1805             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1806 #endif
1807             twovfeps         = _mm_add_pd(vfeps,vfeps);
1808             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1809
1810             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1811             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1812             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1813             GMX_MM_TRANSPOSE2_PD(Y,F);
1814             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1815             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1816             GMX_MM_TRANSPOSE2_PD(G,H);
1817             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1818             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1819             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
1820
1821             fscal            = felec;
1822
1823             /* Update vectorial force */
1824             fix2             = _mm_macc_pd(dx22,fscal,fix2);
1825             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
1826             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
1827             
1828             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
1829             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
1830             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
1831
1832             /**************************
1833              * CALCULATE INTERACTIONS *
1834              **************************/
1835
1836             r23              = _mm_mul_pd(rsq23,rinv23);
1837
1838             /* Calculate table index by multiplying r with table scale and truncate to integer */
1839             rt               = _mm_mul_pd(r23,vftabscale);
1840             vfitab           = _mm_cvttpd_epi32(rt);
1841 #ifdef __XOP__
1842             vfeps            = _mm_frcz_pd(rt);
1843 #else
1844             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1845 #endif
1846             twovfeps         = _mm_add_pd(vfeps,vfeps);
1847             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1848
1849             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1850             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1851             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1852             GMX_MM_TRANSPOSE2_PD(Y,F);
1853             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1854             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1855             GMX_MM_TRANSPOSE2_PD(G,H);
1856             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1857             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1858             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
1859
1860             fscal            = felec;
1861
1862             /* Update vectorial force */
1863             fix2             = _mm_macc_pd(dx23,fscal,fix2);
1864             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
1865             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
1866             
1867             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
1868             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
1869             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
1870
1871             /**************************
1872              * CALCULATE INTERACTIONS *
1873              **************************/
1874
1875             r31              = _mm_mul_pd(rsq31,rinv31);
1876
1877             /* Calculate table index by multiplying r with table scale and truncate to integer */
1878             rt               = _mm_mul_pd(r31,vftabscale);
1879             vfitab           = _mm_cvttpd_epi32(rt);
1880 #ifdef __XOP__
1881             vfeps            = _mm_frcz_pd(rt);
1882 #else
1883             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1884 #endif
1885             twovfeps         = _mm_add_pd(vfeps,vfeps);
1886             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1887
1888             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1889             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1890             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1891             GMX_MM_TRANSPOSE2_PD(Y,F);
1892             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1893             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1894             GMX_MM_TRANSPOSE2_PD(G,H);
1895             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1896             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1897             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
1898
1899             fscal            = felec;
1900
1901             /* Update vectorial force */
1902             fix3             = _mm_macc_pd(dx31,fscal,fix3);
1903             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
1904             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
1905             
1906             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
1907             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
1908             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
1909
1910             /**************************
1911              * CALCULATE INTERACTIONS *
1912              **************************/
1913
1914             r32              = _mm_mul_pd(rsq32,rinv32);
1915
1916             /* Calculate table index by multiplying r with table scale and truncate to integer */
1917             rt               = _mm_mul_pd(r32,vftabscale);
1918             vfitab           = _mm_cvttpd_epi32(rt);
1919 #ifdef __XOP__
1920             vfeps            = _mm_frcz_pd(rt);
1921 #else
1922             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1923 #endif
1924             twovfeps         = _mm_add_pd(vfeps,vfeps);
1925             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1926
1927             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1928             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1929             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1930             GMX_MM_TRANSPOSE2_PD(Y,F);
1931             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1932             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1933             GMX_MM_TRANSPOSE2_PD(G,H);
1934             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1935             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1936             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
1937
1938             fscal            = felec;
1939
1940             /* Update vectorial force */
1941             fix3             = _mm_macc_pd(dx32,fscal,fix3);
1942             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
1943             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
1944             
1945             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
1946             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
1947             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
1948
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             r33              = _mm_mul_pd(rsq33,rinv33);
1954
1955             /* Calculate table index by multiplying r with table scale and truncate to integer */
1956             rt               = _mm_mul_pd(r33,vftabscale);
1957             vfitab           = _mm_cvttpd_epi32(rt);
1958 #ifdef __XOP__
1959             vfeps            = _mm_frcz_pd(rt);
1960 #else
1961             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1962 #endif
1963             twovfeps         = _mm_add_pd(vfeps,vfeps);
1964             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1965
1966             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1967             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1968             F                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1969             GMX_MM_TRANSPOSE2_PD(Y,F);
1970             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1971             H                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
1972             GMX_MM_TRANSPOSE2_PD(G,H);
1973             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
1974             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
1975             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
1976
1977             fscal            = felec;
1978
1979             /* Update vectorial force */
1980             fix3             = _mm_macc_pd(dx33,fscal,fix3);
1981             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
1982             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
1983             
1984             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
1985             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
1986             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
1987
1988             gmx_mm_decrement_4rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1989
1990             /* Inner loop uses 432 flops */
1991         }
1992
1993         if(jidx<j_index_end)
1994         {
1995
1996             jnrA             = jjnr[jidx];
1997             j_coord_offsetA  = DIM*jnrA;
1998
1999             /* load j atom coordinates */
2000             gmx_mm_load_4rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
2001                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2002                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2003
2004             /* Calculate displacement vector */
2005             dx00             = _mm_sub_pd(ix0,jx0);
2006             dy00             = _mm_sub_pd(iy0,jy0);
2007             dz00             = _mm_sub_pd(iz0,jz0);
2008             dx11             = _mm_sub_pd(ix1,jx1);
2009             dy11             = _mm_sub_pd(iy1,jy1);
2010             dz11             = _mm_sub_pd(iz1,jz1);
2011             dx12             = _mm_sub_pd(ix1,jx2);
2012             dy12             = _mm_sub_pd(iy1,jy2);
2013             dz12             = _mm_sub_pd(iz1,jz2);
2014             dx13             = _mm_sub_pd(ix1,jx3);
2015             dy13             = _mm_sub_pd(iy1,jy3);
2016             dz13             = _mm_sub_pd(iz1,jz3);
2017             dx21             = _mm_sub_pd(ix2,jx1);
2018             dy21             = _mm_sub_pd(iy2,jy1);
2019             dz21             = _mm_sub_pd(iz2,jz1);
2020             dx22             = _mm_sub_pd(ix2,jx2);
2021             dy22             = _mm_sub_pd(iy2,jy2);
2022             dz22             = _mm_sub_pd(iz2,jz2);
2023             dx23             = _mm_sub_pd(ix2,jx3);
2024             dy23             = _mm_sub_pd(iy2,jy3);
2025             dz23             = _mm_sub_pd(iz2,jz3);
2026             dx31             = _mm_sub_pd(ix3,jx1);
2027             dy31             = _mm_sub_pd(iy3,jy1);
2028             dz31             = _mm_sub_pd(iz3,jz1);
2029             dx32             = _mm_sub_pd(ix3,jx2);
2030             dy32             = _mm_sub_pd(iy3,jy2);
2031             dz32             = _mm_sub_pd(iz3,jz2);
2032             dx33             = _mm_sub_pd(ix3,jx3);
2033             dy33             = _mm_sub_pd(iy3,jy3);
2034             dz33             = _mm_sub_pd(iz3,jz3);
2035
2036             /* Calculate squared distance and things based on it */
2037             rsq00            = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
2038             rsq11            = gmx_mm_calc_rsq_pd(dx11,dy11,dz11);
2039             rsq12            = gmx_mm_calc_rsq_pd(dx12,dy12,dz12);
2040             rsq13            = gmx_mm_calc_rsq_pd(dx13,dy13,dz13);
2041             rsq21            = gmx_mm_calc_rsq_pd(dx21,dy21,dz21);
2042             rsq22            = gmx_mm_calc_rsq_pd(dx22,dy22,dz22);
2043             rsq23            = gmx_mm_calc_rsq_pd(dx23,dy23,dz23);
2044             rsq31            = gmx_mm_calc_rsq_pd(dx31,dy31,dz31);
2045             rsq32            = gmx_mm_calc_rsq_pd(dx32,dy32,dz32);
2046             rsq33            = gmx_mm_calc_rsq_pd(dx33,dy33,dz33);
2047
2048             rinv00           = gmx_mm_invsqrt_pd(rsq00);
2049             rinv11           = gmx_mm_invsqrt_pd(rsq11);
2050             rinv12           = gmx_mm_invsqrt_pd(rsq12);
2051             rinv13           = gmx_mm_invsqrt_pd(rsq13);
2052             rinv21           = gmx_mm_invsqrt_pd(rsq21);
2053             rinv22           = gmx_mm_invsqrt_pd(rsq22);
2054             rinv23           = gmx_mm_invsqrt_pd(rsq23);
2055             rinv31           = gmx_mm_invsqrt_pd(rsq31);
2056             rinv32           = gmx_mm_invsqrt_pd(rsq32);
2057             rinv33           = gmx_mm_invsqrt_pd(rsq33);
2058
2059             fjx0             = _mm_setzero_pd();
2060             fjy0             = _mm_setzero_pd();
2061             fjz0             = _mm_setzero_pd();
2062             fjx1             = _mm_setzero_pd();
2063             fjy1             = _mm_setzero_pd();
2064             fjz1             = _mm_setzero_pd();
2065             fjx2             = _mm_setzero_pd();
2066             fjy2             = _mm_setzero_pd();
2067             fjz2             = _mm_setzero_pd();
2068             fjx3             = _mm_setzero_pd();
2069             fjy3             = _mm_setzero_pd();
2070             fjz3             = _mm_setzero_pd();
2071
2072             /**************************
2073              * CALCULATE INTERACTIONS *
2074              **************************/
2075
2076             r00              = _mm_mul_pd(rsq00,rinv00);
2077
2078             /* Calculate table index by multiplying r with table scale and truncate to integer */
2079             rt               = _mm_mul_pd(r00,vftabscale);
2080             vfitab           = _mm_cvttpd_epi32(rt);
2081 #ifdef __XOP__
2082             vfeps            = _mm_frcz_pd(rt);
2083 #else
2084             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2085 #endif
2086             twovfeps         = _mm_add_pd(vfeps,vfeps);
2087             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2088
2089             /* CUBIC SPLINE TABLE DISPERSION */
2090             vfitab           = _mm_add_epi32(vfitab,ifour);
2091             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2092             F                = _mm_setzero_pd();
2093             GMX_MM_TRANSPOSE2_PD(Y,F);
2094             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2095             H                = _mm_setzero_pd();
2096             GMX_MM_TRANSPOSE2_PD(G,H);
2097             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
2098             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
2099             fvdw6            = _mm_mul_pd(c6_00,FF);
2100
2101             /* CUBIC SPLINE TABLE REPULSION */
2102             vfitab           = _mm_add_epi32(vfitab,ifour);
2103             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2104             F                = _mm_setzero_pd();
2105             GMX_MM_TRANSPOSE2_PD(Y,F);
2106             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2107             H                = _mm_setzero_pd();
2108             GMX_MM_TRANSPOSE2_PD(G,H);
2109             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
2110             FF               = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
2111             fvdw12           = _mm_mul_pd(c12_00,FF);
2112             fvdw             = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
2113
2114             fscal            = fvdw;
2115
2116             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2117
2118             /* Update vectorial force */
2119             fix0             = _mm_macc_pd(dx00,fscal,fix0);
2120             fiy0             = _mm_macc_pd(dy00,fscal,fiy0);
2121             fiz0             = _mm_macc_pd(dz00,fscal,fiz0);
2122             
2123             fjx0             = _mm_macc_pd(dx00,fscal,fjx0);
2124             fjy0             = _mm_macc_pd(dy00,fscal,fjy0);
2125             fjz0             = _mm_macc_pd(dz00,fscal,fjz0);
2126
2127             /**************************
2128              * CALCULATE INTERACTIONS *
2129              **************************/
2130
2131             r11              = _mm_mul_pd(rsq11,rinv11);
2132
2133             /* Calculate table index by multiplying r with table scale and truncate to integer */
2134             rt               = _mm_mul_pd(r11,vftabscale);
2135             vfitab           = _mm_cvttpd_epi32(rt);
2136 #ifdef __XOP__
2137             vfeps            = _mm_frcz_pd(rt);
2138 #else
2139             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2140 #endif
2141             twovfeps         = _mm_add_pd(vfeps,vfeps);
2142             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2143
2144             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2145             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2146             F                = _mm_setzero_pd();
2147             GMX_MM_TRANSPOSE2_PD(Y,F);
2148             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2149             H                = _mm_setzero_pd();
2150             GMX_MM_TRANSPOSE2_PD(G,H);
2151             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2152             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2153             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq11,FF),_mm_mul_pd(vftabscale,rinv11)));
2154
2155             fscal            = felec;
2156
2157             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2158
2159             /* Update vectorial force */
2160             fix1             = _mm_macc_pd(dx11,fscal,fix1);
2161             fiy1             = _mm_macc_pd(dy11,fscal,fiy1);
2162             fiz1             = _mm_macc_pd(dz11,fscal,fiz1);
2163             
2164             fjx1             = _mm_macc_pd(dx11,fscal,fjx1);
2165             fjy1             = _mm_macc_pd(dy11,fscal,fjy1);
2166             fjz1             = _mm_macc_pd(dz11,fscal,fjz1);
2167
2168             /**************************
2169              * CALCULATE INTERACTIONS *
2170              **************************/
2171
2172             r12              = _mm_mul_pd(rsq12,rinv12);
2173
2174             /* Calculate table index by multiplying r with table scale and truncate to integer */
2175             rt               = _mm_mul_pd(r12,vftabscale);
2176             vfitab           = _mm_cvttpd_epi32(rt);
2177 #ifdef __XOP__
2178             vfeps            = _mm_frcz_pd(rt);
2179 #else
2180             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2181 #endif
2182             twovfeps         = _mm_add_pd(vfeps,vfeps);
2183             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2184
2185             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2186             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2187             F                = _mm_setzero_pd();
2188             GMX_MM_TRANSPOSE2_PD(Y,F);
2189             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2190             H                = _mm_setzero_pd();
2191             GMX_MM_TRANSPOSE2_PD(G,H);
2192             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2193             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2194             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq12,FF),_mm_mul_pd(vftabscale,rinv12)));
2195
2196             fscal            = felec;
2197
2198             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2199
2200             /* Update vectorial force */
2201             fix1             = _mm_macc_pd(dx12,fscal,fix1);
2202             fiy1             = _mm_macc_pd(dy12,fscal,fiy1);
2203             fiz1             = _mm_macc_pd(dz12,fscal,fiz1);
2204             
2205             fjx2             = _mm_macc_pd(dx12,fscal,fjx2);
2206             fjy2             = _mm_macc_pd(dy12,fscal,fjy2);
2207             fjz2             = _mm_macc_pd(dz12,fscal,fjz2);
2208
2209             /**************************
2210              * CALCULATE INTERACTIONS *
2211              **************************/
2212
2213             r13              = _mm_mul_pd(rsq13,rinv13);
2214
2215             /* Calculate table index by multiplying r with table scale and truncate to integer */
2216             rt               = _mm_mul_pd(r13,vftabscale);
2217             vfitab           = _mm_cvttpd_epi32(rt);
2218 #ifdef __XOP__
2219             vfeps            = _mm_frcz_pd(rt);
2220 #else
2221             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2222 #endif
2223             twovfeps         = _mm_add_pd(vfeps,vfeps);
2224             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2225
2226             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2227             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2228             F                = _mm_setzero_pd();
2229             GMX_MM_TRANSPOSE2_PD(Y,F);
2230             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2231             H                = _mm_setzero_pd();
2232             GMX_MM_TRANSPOSE2_PD(G,H);
2233             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2234             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2235             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq13,FF),_mm_mul_pd(vftabscale,rinv13)));
2236
2237             fscal            = felec;
2238
2239             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2240
2241             /* Update vectorial force */
2242             fix1             = _mm_macc_pd(dx13,fscal,fix1);
2243             fiy1             = _mm_macc_pd(dy13,fscal,fiy1);
2244             fiz1             = _mm_macc_pd(dz13,fscal,fiz1);
2245             
2246             fjx3             = _mm_macc_pd(dx13,fscal,fjx3);
2247             fjy3             = _mm_macc_pd(dy13,fscal,fjy3);
2248             fjz3             = _mm_macc_pd(dz13,fscal,fjz3);
2249
2250             /**************************
2251              * CALCULATE INTERACTIONS *
2252              **************************/
2253
2254             r21              = _mm_mul_pd(rsq21,rinv21);
2255
2256             /* Calculate table index by multiplying r with table scale and truncate to integer */
2257             rt               = _mm_mul_pd(r21,vftabscale);
2258             vfitab           = _mm_cvttpd_epi32(rt);
2259 #ifdef __XOP__
2260             vfeps            = _mm_frcz_pd(rt);
2261 #else
2262             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2263 #endif
2264             twovfeps         = _mm_add_pd(vfeps,vfeps);
2265             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2266
2267             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2268             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2269             F                = _mm_setzero_pd();
2270             GMX_MM_TRANSPOSE2_PD(Y,F);
2271             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2272             H                = _mm_setzero_pd();
2273             GMX_MM_TRANSPOSE2_PD(G,H);
2274             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2275             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2276             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq21,FF),_mm_mul_pd(vftabscale,rinv21)));
2277
2278             fscal            = felec;
2279
2280             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2281
2282             /* Update vectorial force */
2283             fix2             = _mm_macc_pd(dx21,fscal,fix2);
2284             fiy2             = _mm_macc_pd(dy21,fscal,fiy2);
2285             fiz2             = _mm_macc_pd(dz21,fscal,fiz2);
2286             
2287             fjx1             = _mm_macc_pd(dx21,fscal,fjx1);
2288             fjy1             = _mm_macc_pd(dy21,fscal,fjy1);
2289             fjz1             = _mm_macc_pd(dz21,fscal,fjz1);
2290
2291             /**************************
2292              * CALCULATE INTERACTIONS *
2293              **************************/
2294
2295             r22              = _mm_mul_pd(rsq22,rinv22);
2296
2297             /* Calculate table index by multiplying r with table scale and truncate to integer */
2298             rt               = _mm_mul_pd(r22,vftabscale);
2299             vfitab           = _mm_cvttpd_epi32(rt);
2300 #ifdef __XOP__
2301             vfeps            = _mm_frcz_pd(rt);
2302 #else
2303             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2304 #endif
2305             twovfeps         = _mm_add_pd(vfeps,vfeps);
2306             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2307
2308             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2309             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2310             F                = _mm_setzero_pd();
2311             GMX_MM_TRANSPOSE2_PD(Y,F);
2312             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2313             H                = _mm_setzero_pd();
2314             GMX_MM_TRANSPOSE2_PD(G,H);
2315             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2316             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2317             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq22,FF),_mm_mul_pd(vftabscale,rinv22)));
2318
2319             fscal            = felec;
2320
2321             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2322
2323             /* Update vectorial force */
2324             fix2             = _mm_macc_pd(dx22,fscal,fix2);
2325             fiy2             = _mm_macc_pd(dy22,fscal,fiy2);
2326             fiz2             = _mm_macc_pd(dz22,fscal,fiz2);
2327             
2328             fjx2             = _mm_macc_pd(dx22,fscal,fjx2);
2329             fjy2             = _mm_macc_pd(dy22,fscal,fjy2);
2330             fjz2             = _mm_macc_pd(dz22,fscal,fjz2);
2331
2332             /**************************
2333              * CALCULATE INTERACTIONS *
2334              **************************/
2335
2336             r23              = _mm_mul_pd(rsq23,rinv23);
2337
2338             /* Calculate table index by multiplying r with table scale and truncate to integer */
2339             rt               = _mm_mul_pd(r23,vftabscale);
2340             vfitab           = _mm_cvttpd_epi32(rt);
2341 #ifdef __XOP__
2342             vfeps            = _mm_frcz_pd(rt);
2343 #else
2344             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2345 #endif
2346             twovfeps         = _mm_add_pd(vfeps,vfeps);
2347             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2348
2349             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2350             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2351             F                = _mm_setzero_pd();
2352             GMX_MM_TRANSPOSE2_PD(Y,F);
2353             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2354             H                = _mm_setzero_pd();
2355             GMX_MM_TRANSPOSE2_PD(G,H);
2356             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2357             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2358             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq23,FF),_mm_mul_pd(vftabscale,rinv23)));
2359
2360             fscal            = felec;
2361
2362             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2363
2364             /* Update vectorial force */
2365             fix2             = _mm_macc_pd(dx23,fscal,fix2);
2366             fiy2             = _mm_macc_pd(dy23,fscal,fiy2);
2367             fiz2             = _mm_macc_pd(dz23,fscal,fiz2);
2368             
2369             fjx3             = _mm_macc_pd(dx23,fscal,fjx3);
2370             fjy3             = _mm_macc_pd(dy23,fscal,fjy3);
2371             fjz3             = _mm_macc_pd(dz23,fscal,fjz3);
2372
2373             /**************************
2374              * CALCULATE INTERACTIONS *
2375              **************************/
2376
2377             r31              = _mm_mul_pd(rsq31,rinv31);
2378
2379             /* Calculate table index by multiplying r with table scale and truncate to integer */
2380             rt               = _mm_mul_pd(r31,vftabscale);
2381             vfitab           = _mm_cvttpd_epi32(rt);
2382 #ifdef __XOP__
2383             vfeps            = _mm_frcz_pd(rt);
2384 #else
2385             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2386 #endif
2387             twovfeps         = _mm_add_pd(vfeps,vfeps);
2388             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2389
2390             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2391             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2392             F                = _mm_setzero_pd();
2393             GMX_MM_TRANSPOSE2_PD(Y,F);
2394             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2395             H                = _mm_setzero_pd();
2396             GMX_MM_TRANSPOSE2_PD(G,H);
2397             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2398             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2399             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq31,FF),_mm_mul_pd(vftabscale,rinv31)));
2400
2401             fscal            = felec;
2402
2403             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2404
2405             /* Update vectorial force */
2406             fix3             = _mm_macc_pd(dx31,fscal,fix3);
2407             fiy3             = _mm_macc_pd(dy31,fscal,fiy3);
2408             fiz3             = _mm_macc_pd(dz31,fscal,fiz3);
2409             
2410             fjx1             = _mm_macc_pd(dx31,fscal,fjx1);
2411             fjy1             = _mm_macc_pd(dy31,fscal,fjy1);
2412             fjz1             = _mm_macc_pd(dz31,fscal,fjz1);
2413
2414             /**************************
2415              * CALCULATE INTERACTIONS *
2416              **************************/
2417
2418             r32              = _mm_mul_pd(rsq32,rinv32);
2419
2420             /* Calculate table index by multiplying r with table scale and truncate to integer */
2421             rt               = _mm_mul_pd(r32,vftabscale);
2422             vfitab           = _mm_cvttpd_epi32(rt);
2423 #ifdef __XOP__
2424             vfeps            = _mm_frcz_pd(rt);
2425 #else
2426             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2427 #endif
2428             twovfeps         = _mm_add_pd(vfeps,vfeps);
2429             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2430
2431             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2432             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2433             F                = _mm_setzero_pd();
2434             GMX_MM_TRANSPOSE2_PD(Y,F);
2435             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2436             H                = _mm_setzero_pd();
2437             GMX_MM_TRANSPOSE2_PD(G,H);
2438             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2439             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2440             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq32,FF),_mm_mul_pd(vftabscale,rinv32)));
2441
2442             fscal            = felec;
2443
2444             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2445
2446             /* Update vectorial force */
2447             fix3             = _mm_macc_pd(dx32,fscal,fix3);
2448             fiy3             = _mm_macc_pd(dy32,fscal,fiy3);
2449             fiz3             = _mm_macc_pd(dz32,fscal,fiz3);
2450             
2451             fjx2             = _mm_macc_pd(dx32,fscal,fjx2);
2452             fjy2             = _mm_macc_pd(dy32,fscal,fjy2);
2453             fjz2             = _mm_macc_pd(dz32,fscal,fjz2);
2454
2455             /**************************
2456              * CALCULATE INTERACTIONS *
2457              **************************/
2458
2459             r33              = _mm_mul_pd(rsq33,rinv33);
2460
2461             /* Calculate table index by multiplying r with table scale and truncate to integer */
2462             rt               = _mm_mul_pd(r33,vftabscale);
2463             vfitab           = _mm_cvttpd_epi32(rt);
2464 #ifdef __XOP__
2465             vfeps            = _mm_frcz_pd(rt);
2466 #else
2467             vfeps            = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
2468 #endif
2469             twovfeps         = _mm_add_pd(vfeps,vfeps);
2470             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2471
2472             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2473             Y                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
2474             F                = _mm_setzero_pd();
2475             GMX_MM_TRANSPOSE2_PD(Y,F);
2476             G                = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
2477             H                = _mm_setzero_pd();
2478             GMX_MM_TRANSPOSE2_PD(G,H);
2479             Fp               = _mm_macc_pd(vfeps,_mm_macc_pd(vfeps,H,G),F);
2480             FF               = _mm_macc_pd(_mm_macc_pd(twovfeps,H,G),vfeps,Fp);
2481             felec            = _mm_xor_pd(signbit,_mm_mul_pd(_mm_mul_pd(qq33,FF),_mm_mul_pd(vftabscale,rinv33)));
2482
2483             fscal            = felec;
2484
2485             fscal            = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
2486
2487             /* Update vectorial force */
2488             fix3             = _mm_macc_pd(dx33,fscal,fix3);
2489             fiy3             = _mm_macc_pd(dy33,fscal,fiy3);
2490             fiz3             = _mm_macc_pd(dz33,fscal,fiz3);
2491             
2492             fjx3             = _mm_macc_pd(dx33,fscal,fjx3);
2493             fjy3             = _mm_macc_pd(dy33,fscal,fjy3);
2494             fjz3             = _mm_macc_pd(dz33,fscal,fjz3);
2495
2496             gmx_mm_decrement_4rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2497
2498             /* Inner loop uses 432 flops */
2499         }
2500
2501         /* End of innermost loop */
2502
2503         gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2504                                               f+i_coord_offset,fshift+i_shift_offset);
2505
2506         /* Increment number of inner iterations */
2507         inneriter                  += j_index_end - j_index_start;
2508
2509         /* Outer loop uses 24 flops */
2510     }
2511
2512     /* Increment number of outer iterations */
2513     outeriter        += nri;
2514
2515     /* Update outer/inner flops */
2516
2517     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*432);
2518 }