Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_128_fma_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
38  * Electrostatics interaction: CubicSplineTable
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwioffset3;
76     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
96     real             *charge;
97     int              nvdwtype;
98     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99     int              *vdwtype;
100     real             *vdwparam;
101     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
102     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
103     __m128i          vfitab;
104     __m128i          ifour       = _mm_set1_epi32(4);
105     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
106     real             *vftab;
107     __m128           dummy_mask,cutoff_mask;
108     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
109     __m128           one     = _mm_set1_ps(1.0);
110     __m128           two     = _mm_set1_ps(2.0);
111     x                = xx[0];
112     f                = ff[0];
113
114     nri              = nlist->nri;
115     iinr             = nlist->iinr;
116     jindex           = nlist->jindex;
117     jjnr             = nlist->jjnr;
118     shiftidx         = nlist->shift;
119     gid              = nlist->gid;
120     shiftvec         = fr->shift_vec[0];
121     fshift           = fr->fshift[0];
122     facel            = _mm_set1_ps(fr->epsfac);
123     charge           = mdatoms->chargeA;
124     nvdwtype         = fr->ntype;
125     vdwparam         = fr->nbfp;
126     vdwtype          = mdatoms->typeA;
127
128     vftab            = kernel_data->table_elec_vdw->data;
129     vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
130
131     /* Setup water-specific parameters */
132     inr              = nlist->iinr[0];
133     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
136     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
137
138     jq1              = _mm_set1_ps(charge[inr+1]);
139     jq2              = _mm_set1_ps(charge[inr+2]);
140     jq3              = _mm_set1_ps(charge[inr+3]);
141     vdwjidx0A        = 2*vdwtype[inr+0];
142     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
143     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
144     qq11             = _mm_mul_ps(iq1,jq1);
145     qq12             = _mm_mul_ps(iq1,jq2);
146     qq13             = _mm_mul_ps(iq1,jq3);
147     qq21             = _mm_mul_ps(iq2,jq1);
148     qq22             = _mm_mul_ps(iq2,jq2);
149     qq23             = _mm_mul_ps(iq2,jq3);
150     qq31             = _mm_mul_ps(iq3,jq1);
151     qq32             = _mm_mul_ps(iq3,jq2);
152     qq33             = _mm_mul_ps(iq3,jq3);
153
154     /* Avoid stupid compiler warnings */
155     jnrA = jnrB = jnrC = jnrD = 0;
156     j_coord_offsetA = 0;
157     j_coord_offsetB = 0;
158     j_coord_offsetC = 0;
159     j_coord_offsetD = 0;
160
161     outeriter        = 0;
162     inneriter        = 0;
163
164     for(iidx=0;iidx<4*DIM;iidx++)
165     {
166         scratch[iidx] = 0.0;
167     }
168
169     /* Start outer loop over neighborlists */
170     for(iidx=0; iidx<nri; iidx++)
171     {
172         /* Load shift vector for this list */
173         i_shift_offset   = DIM*shiftidx[iidx];
174
175         /* Load limits for loop over neighbors */
176         j_index_start    = jindex[iidx];
177         j_index_end      = jindex[iidx+1];
178
179         /* Get outer coordinate index */
180         inr              = iinr[iidx];
181         i_coord_offset   = DIM*inr;
182
183         /* Load i particle coords and add shift vector */
184         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
185                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
186
187         fix0             = _mm_setzero_ps();
188         fiy0             = _mm_setzero_ps();
189         fiz0             = _mm_setzero_ps();
190         fix1             = _mm_setzero_ps();
191         fiy1             = _mm_setzero_ps();
192         fiz1             = _mm_setzero_ps();
193         fix2             = _mm_setzero_ps();
194         fiy2             = _mm_setzero_ps();
195         fiz2             = _mm_setzero_ps();
196         fix3             = _mm_setzero_ps();
197         fiy3             = _mm_setzero_ps();
198         fiz3             = _mm_setzero_ps();
199
200         /* Reset potential sums */
201         velecsum         = _mm_setzero_ps();
202         vvdwsum          = _mm_setzero_ps();
203
204         /* Start inner kernel loop */
205         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
206         {
207
208             /* Get j neighbor index, and coordinate index */
209             jnrA             = jjnr[jidx];
210             jnrB             = jjnr[jidx+1];
211             jnrC             = jjnr[jidx+2];
212             jnrD             = jjnr[jidx+3];
213             j_coord_offsetA  = DIM*jnrA;
214             j_coord_offsetB  = DIM*jnrB;
215             j_coord_offsetC  = DIM*jnrC;
216             j_coord_offsetD  = DIM*jnrD;
217
218             /* load j atom coordinates */
219             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
220                                               x+j_coord_offsetC,x+j_coord_offsetD,
221                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
222                                               &jy2,&jz2,&jx3,&jy3,&jz3);
223
224             /* Calculate displacement vector */
225             dx00             = _mm_sub_ps(ix0,jx0);
226             dy00             = _mm_sub_ps(iy0,jy0);
227             dz00             = _mm_sub_ps(iz0,jz0);
228             dx11             = _mm_sub_ps(ix1,jx1);
229             dy11             = _mm_sub_ps(iy1,jy1);
230             dz11             = _mm_sub_ps(iz1,jz1);
231             dx12             = _mm_sub_ps(ix1,jx2);
232             dy12             = _mm_sub_ps(iy1,jy2);
233             dz12             = _mm_sub_ps(iz1,jz2);
234             dx13             = _mm_sub_ps(ix1,jx3);
235             dy13             = _mm_sub_ps(iy1,jy3);
236             dz13             = _mm_sub_ps(iz1,jz3);
237             dx21             = _mm_sub_ps(ix2,jx1);
238             dy21             = _mm_sub_ps(iy2,jy1);
239             dz21             = _mm_sub_ps(iz2,jz1);
240             dx22             = _mm_sub_ps(ix2,jx2);
241             dy22             = _mm_sub_ps(iy2,jy2);
242             dz22             = _mm_sub_ps(iz2,jz2);
243             dx23             = _mm_sub_ps(ix2,jx3);
244             dy23             = _mm_sub_ps(iy2,jy3);
245             dz23             = _mm_sub_ps(iz2,jz3);
246             dx31             = _mm_sub_ps(ix3,jx1);
247             dy31             = _mm_sub_ps(iy3,jy1);
248             dz31             = _mm_sub_ps(iz3,jz1);
249             dx32             = _mm_sub_ps(ix3,jx2);
250             dy32             = _mm_sub_ps(iy3,jy2);
251             dz32             = _mm_sub_ps(iz3,jz2);
252             dx33             = _mm_sub_ps(ix3,jx3);
253             dy33             = _mm_sub_ps(iy3,jy3);
254             dz33             = _mm_sub_ps(iz3,jz3);
255
256             /* Calculate squared distance and things based on it */
257             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
258             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
259             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
260             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
261             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
262             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
263             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
264             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
265             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
266             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
267
268             rinv00           = gmx_mm_invsqrt_ps(rsq00);
269             rinv11           = gmx_mm_invsqrt_ps(rsq11);
270             rinv12           = gmx_mm_invsqrt_ps(rsq12);
271             rinv13           = gmx_mm_invsqrt_ps(rsq13);
272             rinv21           = gmx_mm_invsqrt_ps(rsq21);
273             rinv22           = gmx_mm_invsqrt_ps(rsq22);
274             rinv23           = gmx_mm_invsqrt_ps(rsq23);
275             rinv31           = gmx_mm_invsqrt_ps(rsq31);
276             rinv32           = gmx_mm_invsqrt_ps(rsq32);
277             rinv33           = gmx_mm_invsqrt_ps(rsq33);
278
279             fjx0             = _mm_setzero_ps();
280             fjy0             = _mm_setzero_ps();
281             fjz0             = _mm_setzero_ps();
282             fjx1             = _mm_setzero_ps();
283             fjy1             = _mm_setzero_ps();
284             fjz1             = _mm_setzero_ps();
285             fjx2             = _mm_setzero_ps();
286             fjy2             = _mm_setzero_ps();
287             fjz2             = _mm_setzero_ps();
288             fjx3             = _mm_setzero_ps();
289             fjy3             = _mm_setzero_ps();
290             fjz3             = _mm_setzero_ps();
291
292             /**************************
293              * CALCULATE INTERACTIONS *
294              **************************/
295
296             r00              = _mm_mul_ps(rsq00,rinv00);
297
298             /* Calculate table index by multiplying r with table scale and truncate to integer */
299             rt               = _mm_mul_ps(r00,vftabscale);
300             vfitab           = _mm_cvttps_epi32(rt);
301 #ifdef __XOP__
302             vfeps            = _mm_frcz_ps(rt);
303 #else
304             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
305 #endif
306             twovfeps         = _mm_add_ps(vfeps,vfeps);
307             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
308
309             /* CUBIC SPLINE TABLE DISPERSION */
310             vfitab           = _mm_add_epi32(vfitab,ifour);
311             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
312             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
313             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
314             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
315             _MM_TRANSPOSE4_PS(Y,F,G,H);
316             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
317             VV               = _mm_macc_ps(vfeps,Fp,Y);
318             vvdw6            = _mm_mul_ps(c6_00,VV);
319             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
320             fvdw6            = _mm_mul_ps(c6_00,FF);
321
322             /* CUBIC SPLINE TABLE REPULSION */
323             vfitab           = _mm_add_epi32(vfitab,ifour);
324             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
325             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
326             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
327             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
328             _MM_TRANSPOSE4_PS(Y,F,G,H);
329             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
330             VV               = _mm_macc_ps(vfeps,Fp,Y);
331             vvdw12           = _mm_mul_ps(c12_00,VV);
332             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
333             fvdw12           = _mm_mul_ps(c12_00,FF);
334             vvdw             = _mm_add_ps(vvdw12,vvdw6);
335             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
336
337             /* Update potential sum for this i atom from the interaction with this j atom. */
338             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
339
340             fscal            = fvdw;
341
342              /* Update vectorial force */
343             fix0             = _mm_macc_ps(dx00,fscal,fix0);
344             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
345             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
346
347             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
348             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
349             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
350
351             /**************************
352              * CALCULATE INTERACTIONS *
353              **************************/
354
355             r11              = _mm_mul_ps(rsq11,rinv11);
356
357             /* Calculate table index by multiplying r with table scale and truncate to integer */
358             rt               = _mm_mul_ps(r11,vftabscale);
359             vfitab           = _mm_cvttps_epi32(rt);
360 #ifdef __XOP__
361             vfeps            = _mm_frcz_ps(rt);
362 #else
363             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
364 #endif
365             twovfeps         = _mm_add_ps(vfeps,vfeps);
366             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
367
368             /* CUBIC SPLINE TABLE ELECTROSTATICS */
369             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
370             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
371             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
372             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
373             _MM_TRANSPOSE4_PS(Y,F,G,H);
374             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
375             VV               = _mm_macc_ps(vfeps,Fp,Y);
376             velec            = _mm_mul_ps(qq11,VV);
377             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
378             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
379
380             /* Update potential sum for this i atom from the interaction with this j atom. */
381             velecsum         = _mm_add_ps(velecsum,velec);
382
383             fscal            = felec;
384
385              /* Update vectorial force */
386             fix1             = _mm_macc_ps(dx11,fscal,fix1);
387             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
388             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
389
390             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
391             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
392             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
393
394             /**************************
395              * CALCULATE INTERACTIONS *
396              **************************/
397
398             r12              = _mm_mul_ps(rsq12,rinv12);
399
400             /* Calculate table index by multiplying r with table scale and truncate to integer */
401             rt               = _mm_mul_ps(r12,vftabscale);
402             vfitab           = _mm_cvttps_epi32(rt);
403 #ifdef __XOP__
404             vfeps            = _mm_frcz_ps(rt);
405 #else
406             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
407 #endif
408             twovfeps         = _mm_add_ps(vfeps,vfeps);
409             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
410
411             /* CUBIC SPLINE TABLE ELECTROSTATICS */
412             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
413             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
414             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
415             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
416             _MM_TRANSPOSE4_PS(Y,F,G,H);
417             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
418             VV               = _mm_macc_ps(vfeps,Fp,Y);
419             velec            = _mm_mul_ps(qq12,VV);
420             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
421             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
422
423             /* Update potential sum for this i atom from the interaction with this j atom. */
424             velecsum         = _mm_add_ps(velecsum,velec);
425
426             fscal            = felec;
427
428              /* Update vectorial force */
429             fix1             = _mm_macc_ps(dx12,fscal,fix1);
430             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
431             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
432
433             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
434             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
435             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
436
437             /**************************
438              * CALCULATE INTERACTIONS *
439              **************************/
440
441             r13              = _mm_mul_ps(rsq13,rinv13);
442
443             /* Calculate table index by multiplying r with table scale and truncate to integer */
444             rt               = _mm_mul_ps(r13,vftabscale);
445             vfitab           = _mm_cvttps_epi32(rt);
446 #ifdef __XOP__
447             vfeps            = _mm_frcz_ps(rt);
448 #else
449             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
450 #endif
451             twovfeps         = _mm_add_ps(vfeps,vfeps);
452             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
453
454             /* CUBIC SPLINE TABLE ELECTROSTATICS */
455             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
456             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
457             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
458             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
459             _MM_TRANSPOSE4_PS(Y,F,G,H);
460             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
461             VV               = _mm_macc_ps(vfeps,Fp,Y);
462             velec            = _mm_mul_ps(qq13,VV);
463             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
464             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
465
466             /* Update potential sum for this i atom from the interaction with this j atom. */
467             velecsum         = _mm_add_ps(velecsum,velec);
468
469             fscal            = felec;
470
471              /* Update vectorial force */
472             fix1             = _mm_macc_ps(dx13,fscal,fix1);
473             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
474             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
475
476             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
477             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
478             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
479
480             /**************************
481              * CALCULATE INTERACTIONS *
482              **************************/
483
484             r21              = _mm_mul_ps(rsq21,rinv21);
485
486             /* Calculate table index by multiplying r with table scale and truncate to integer */
487             rt               = _mm_mul_ps(r21,vftabscale);
488             vfitab           = _mm_cvttps_epi32(rt);
489 #ifdef __XOP__
490             vfeps            = _mm_frcz_ps(rt);
491 #else
492             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
493 #endif
494             twovfeps         = _mm_add_ps(vfeps,vfeps);
495             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
496
497             /* CUBIC SPLINE TABLE ELECTROSTATICS */
498             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
499             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
500             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
501             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
502             _MM_TRANSPOSE4_PS(Y,F,G,H);
503             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
504             VV               = _mm_macc_ps(vfeps,Fp,Y);
505             velec            = _mm_mul_ps(qq21,VV);
506             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
507             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velecsum         = _mm_add_ps(velecsum,velec);
511
512             fscal            = felec;
513
514              /* Update vectorial force */
515             fix2             = _mm_macc_ps(dx21,fscal,fix2);
516             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
517             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
518
519             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
520             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
521             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
522
523             /**************************
524              * CALCULATE INTERACTIONS *
525              **************************/
526
527             r22              = _mm_mul_ps(rsq22,rinv22);
528
529             /* Calculate table index by multiplying r with table scale and truncate to integer */
530             rt               = _mm_mul_ps(r22,vftabscale);
531             vfitab           = _mm_cvttps_epi32(rt);
532 #ifdef __XOP__
533             vfeps            = _mm_frcz_ps(rt);
534 #else
535             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
536 #endif
537             twovfeps         = _mm_add_ps(vfeps,vfeps);
538             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
539
540             /* CUBIC SPLINE TABLE ELECTROSTATICS */
541             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
542             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
543             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
544             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
545             _MM_TRANSPOSE4_PS(Y,F,G,H);
546             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
547             VV               = _mm_macc_ps(vfeps,Fp,Y);
548             velec            = _mm_mul_ps(qq22,VV);
549             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
550             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
551
552             /* Update potential sum for this i atom from the interaction with this j atom. */
553             velecsum         = _mm_add_ps(velecsum,velec);
554
555             fscal            = felec;
556
557              /* Update vectorial force */
558             fix2             = _mm_macc_ps(dx22,fscal,fix2);
559             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
560             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
561
562             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
563             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
564             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
565
566             /**************************
567              * CALCULATE INTERACTIONS *
568              **************************/
569
570             r23              = _mm_mul_ps(rsq23,rinv23);
571
572             /* Calculate table index by multiplying r with table scale and truncate to integer */
573             rt               = _mm_mul_ps(r23,vftabscale);
574             vfitab           = _mm_cvttps_epi32(rt);
575 #ifdef __XOP__
576             vfeps            = _mm_frcz_ps(rt);
577 #else
578             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
579 #endif
580             twovfeps         = _mm_add_ps(vfeps,vfeps);
581             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
582
583             /* CUBIC SPLINE TABLE ELECTROSTATICS */
584             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
585             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
586             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
587             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
588             _MM_TRANSPOSE4_PS(Y,F,G,H);
589             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
590             VV               = _mm_macc_ps(vfeps,Fp,Y);
591             velec            = _mm_mul_ps(qq23,VV);
592             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
593             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
594
595             /* Update potential sum for this i atom from the interaction with this j atom. */
596             velecsum         = _mm_add_ps(velecsum,velec);
597
598             fscal            = felec;
599
600              /* Update vectorial force */
601             fix2             = _mm_macc_ps(dx23,fscal,fix2);
602             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
603             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
604
605             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
606             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
607             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
608
609             /**************************
610              * CALCULATE INTERACTIONS *
611              **************************/
612
613             r31              = _mm_mul_ps(rsq31,rinv31);
614
615             /* Calculate table index by multiplying r with table scale and truncate to integer */
616             rt               = _mm_mul_ps(r31,vftabscale);
617             vfitab           = _mm_cvttps_epi32(rt);
618 #ifdef __XOP__
619             vfeps            = _mm_frcz_ps(rt);
620 #else
621             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
622 #endif
623             twovfeps         = _mm_add_ps(vfeps,vfeps);
624             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
625
626             /* CUBIC SPLINE TABLE ELECTROSTATICS */
627             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
628             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
629             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
630             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
631             _MM_TRANSPOSE4_PS(Y,F,G,H);
632             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
633             VV               = _mm_macc_ps(vfeps,Fp,Y);
634             velec            = _mm_mul_ps(qq31,VV);
635             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
636             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
637
638             /* Update potential sum for this i atom from the interaction with this j atom. */
639             velecsum         = _mm_add_ps(velecsum,velec);
640
641             fscal            = felec;
642
643              /* Update vectorial force */
644             fix3             = _mm_macc_ps(dx31,fscal,fix3);
645             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
646             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
647
648             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
649             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
650             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
651
652             /**************************
653              * CALCULATE INTERACTIONS *
654              **************************/
655
656             r32              = _mm_mul_ps(rsq32,rinv32);
657
658             /* Calculate table index by multiplying r with table scale and truncate to integer */
659             rt               = _mm_mul_ps(r32,vftabscale);
660             vfitab           = _mm_cvttps_epi32(rt);
661 #ifdef __XOP__
662             vfeps            = _mm_frcz_ps(rt);
663 #else
664             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
665 #endif
666             twovfeps         = _mm_add_ps(vfeps,vfeps);
667             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
668
669             /* CUBIC SPLINE TABLE ELECTROSTATICS */
670             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
671             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
672             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
673             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
674             _MM_TRANSPOSE4_PS(Y,F,G,H);
675             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
676             VV               = _mm_macc_ps(vfeps,Fp,Y);
677             velec            = _mm_mul_ps(qq32,VV);
678             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
679             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
680
681             /* Update potential sum for this i atom from the interaction with this j atom. */
682             velecsum         = _mm_add_ps(velecsum,velec);
683
684             fscal            = felec;
685
686              /* Update vectorial force */
687             fix3             = _mm_macc_ps(dx32,fscal,fix3);
688             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
689             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
690
691             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
692             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
693             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
694
695             /**************************
696              * CALCULATE INTERACTIONS *
697              **************************/
698
699             r33              = _mm_mul_ps(rsq33,rinv33);
700
701             /* Calculate table index by multiplying r with table scale and truncate to integer */
702             rt               = _mm_mul_ps(r33,vftabscale);
703             vfitab           = _mm_cvttps_epi32(rt);
704 #ifdef __XOP__
705             vfeps            = _mm_frcz_ps(rt);
706 #else
707             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
708 #endif
709             twovfeps         = _mm_add_ps(vfeps,vfeps);
710             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
711
712             /* CUBIC SPLINE TABLE ELECTROSTATICS */
713             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
714             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
715             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
716             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
717             _MM_TRANSPOSE4_PS(Y,F,G,H);
718             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
719             VV               = _mm_macc_ps(vfeps,Fp,Y);
720             velec            = _mm_mul_ps(qq33,VV);
721             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
722             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
723
724             /* Update potential sum for this i atom from the interaction with this j atom. */
725             velecsum         = _mm_add_ps(velecsum,velec);
726
727             fscal            = felec;
728
729              /* Update vectorial force */
730             fix3             = _mm_macc_ps(dx33,fscal,fix3);
731             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
732             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
733
734             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
735             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
736             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
737
738             fjptrA             = f+j_coord_offsetA;
739             fjptrB             = f+j_coord_offsetB;
740             fjptrC             = f+j_coord_offsetC;
741             fjptrD             = f+j_coord_offsetD;
742
743             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
744                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
745                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
746
747             /* Inner loop uses 476 flops */
748         }
749
750         if(jidx<j_index_end)
751         {
752
753             /* Get j neighbor index, and coordinate index */
754             jnrlistA         = jjnr[jidx];
755             jnrlistB         = jjnr[jidx+1];
756             jnrlistC         = jjnr[jidx+2];
757             jnrlistD         = jjnr[jidx+3];
758             /* Sign of each element will be negative for non-real atoms.
759              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
760              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
761              */
762             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
763             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
764             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
765             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
766             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
767             j_coord_offsetA  = DIM*jnrA;
768             j_coord_offsetB  = DIM*jnrB;
769             j_coord_offsetC  = DIM*jnrC;
770             j_coord_offsetD  = DIM*jnrD;
771
772             /* load j atom coordinates */
773             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
774                                               x+j_coord_offsetC,x+j_coord_offsetD,
775                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
776                                               &jy2,&jz2,&jx3,&jy3,&jz3);
777
778             /* Calculate displacement vector */
779             dx00             = _mm_sub_ps(ix0,jx0);
780             dy00             = _mm_sub_ps(iy0,jy0);
781             dz00             = _mm_sub_ps(iz0,jz0);
782             dx11             = _mm_sub_ps(ix1,jx1);
783             dy11             = _mm_sub_ps(iy1,jy1);
784             dz11             = _mm_sub_ps(iz1,jz1);
785             dx12             = _mm_sub_ps(ix1,jx2);
786             dy12             = _mm_sub_ps(iy1,jy2);
787             dz12             = _mm_sub_ps(iz1,jz2);
788             dx13             = _mm_sub_ps(ix1,jx3);
789             dy13             = _mm_sub_ps(iy1,jy3);
790             dz13             = _mm_sub_ps(iz1,jz3);
791             dx21             = _mm_sub_ps(ix2,jx1);
792             dy21             = _mm_sub_ps(iy2,jy1);
793             dz21             = _mm_sub_ps(iz2,jz1);
794             dx22             = _mm_sub_ps(ix2,jx2);
795             dy22             = _mm_sub_ps(iy2,jy2);
796             dz22             = _mm_sub_ps(iz2,jz2);
797             dx23             = _mm_sub_ps(ix2,jx3);
798             dy23             = _mm_sub_ps(iy2,jy3);
799             dz23             = _mm_sub_ps(iz2,jz3);
800             dx31             = _mm_sub_ps(ix3,jx1);
801             dy31             = _mm_sub_ps(iy3,jy1);
802             dz31             = _mm_sub_ps(iz3,jz1);
803             dx32             = _mm_sub_ps(ix3,jx2);
804             dy32             = _mm_sub_ps(iy3,jy2);
805             dz32             = _mm_sub_ps(iz3,jz2);
806             dx33             = _mm_sub_ps(ix3,jx3);
807             dy33             = _mm_sub_ps(iy3,jy3);
808             dz33             = _mm_sub_ps(iz3,jz3);
809
810             /* Calculate squared distance and things based on it */
811             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
812             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
813             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
814             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
815             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
816             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
817             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
818             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
819             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
820             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
821
822             rinv00           = gmx_mm_invsqrt_ps(rsq00);
823             rinv11           = gmx_mm_invsqrt_ps(rsq11);
824             rinv12           = gmx_mm_invsqrt_ps(rsq12);
825             rinv13           = gmx_mm_invsqrt_ps(rsq13);
826             rinv21           = gmx_mm_invsqrt_ps(rsq21);
827             rinv22           = gmx_mm_invsqrt_ps(rsq22);
828             rinv23           = gmx_mm_invsqrt_ps(rsq23);
829             rinv31           = gmx_mm_invsqrt_ps(rsq31);
830             rinv32           = gmx_mm_invsqrt_ps(rsq32);
831             rinv33           = gmx_mm_invsqrt_ps(rsq33);
832
833             fjx0             = _mm_setzero_ps();
834             fjy0             = _mm_setzero_ps();
835             fjz0             = _mm_setzero_ps();
836             fjx1             = _mm_setzero_ps();
837             fjy1             = _mm_setzero_ps();
838             fjz1             = _mm_setzero_ps();
839             fjx2             = _mm_setzero_ps();
840             fjy2             = _mm_setzero_ps();
841             fjz2             = _mm_setzero_ps();
842             fjx3             = _mm_setzero_ps();
843             fjy3             = _mm_setzero_ps();
844             fjz3             = _mm_setzero_ps();
845
846             /**************************
847              * CALCULATE INTERACTIONS *
848              **************************/
849
850             r00              = _mm_mul_ps(rsq00,rinv00);
851             r00              = _mm_andnot_ps(dummy_mask,r00);
852
853             /* Calculate table index by multiplying r with table scale and truncate to integer */
854             rt               = _mm_mul_ps(r00,vftabscale);
855             vfitab           = _mm_cvttps_epi32(rt);
856 #ifdef __XOP__
857             vfeps            = _mm_frcz_ps(rt);
858 #else
859             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
860 #endif
861             twovfeps         = _mm_add_ps(vfeps,vfeps);
862             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
863
864             /* CUBIC SPLINE TABLE DISPERSION */
865             vfitab           = _mm_add_epi32(vfitab,ifour);
866             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
867             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
868             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
869             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
870             _MM_TRANSPOSE4_PS(Y,F,G,H);
871             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
872             VV               = _mm_macc_ps(vfeps,Fp,Y);
873             vvdw6            = _mm_mul_ps(c6_00,VV);
874             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
875             fvdw6            = _mm_mul_ps(c6_00,FF);
876
877             /* CUBIC SPLINE TABLE REPULSION */
878             vfitab           = _mm_add_epi32(vfitab,ifour);
879             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
880             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
881             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
882             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
883             _MM_TRANSPOSE4_PS(Y,F,G,H);
884             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
885             VV               = _mm_macc_ps(vfeps,Fp,Y);
886             vvdw12           = _mm_mul_ps(c12_00,VV);
887             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
888             fvdw12           = _mm_mul_ps(c12_00,FF);
889             vvdw             = _mm_add_ps(vvdw12,vvdw6);
890             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
891
892             /* Update potential sum for this i atom from the interaction with this j atom. */
893             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
894             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
895
896             fscal            = fvdw;
897
898             fscal            = _mm_andnot_ps(dummy_mask,fscal);
899
900              /* Update vectorial force */
901             fix0             = _mm_macc_ps(dx00,fscal,fix0);
902             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
903             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
904
905             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
906             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
907             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
908
909             /**************************
910              * CALCULATE INTERACTIONS *
911              **************************/
912
913             r11              = _mm_mul_ps(rsq11,rinv11);
914             r11              = _mm_andnot_ps(dummy_mask,r11);
915
916             /* Calculate table index by multiplying r with table scale and truncate to integer */
917             rt               = _mm_mul_ps(r11,vftabscale);
918             vfitab           = _mm_cvttps_epi32(rt);
919 #ifdef __XOP__
920             vfeps            = _mm_frcz_ps(rt);
921 #else
922             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
923 #endif
924             twovfeps         = _mm_add_ps(vfeps,vfeps);
925             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
926
927             /* CUBIC SPLINE TABLE ELECTROSTATICS */
928             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
929             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
930             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
931             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
932             _MM_TRANSPOSE4_PS(Y,F,G,H);
933             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
934             VV               = _mm_macc_ps(vfeps,Fp,Y);
935             velec            = _mm_mul_ps(qq11,VV);
936             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
937             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
938
939             /* Update potential sum for this i atom from the interaction with this j atom. */
940             velec            = _mm_andnot_ps(dummy_mask,velec);
941             velecsum         = _mm_add_ps(velecsum,velec);
942
943             fscal            = felec;
944
945             fscal            = _mm_andnot_ps(dummy_mask,fscal);
946
947              /* Update vectorial force */
948             fix1             = _mm_macc_ps(dx11,fscal,fix1);
949             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
950             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
951
952             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
953             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
954             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
955
956             /**************************
957              * CALCULATE INTERACTIONS *
958              **************************/
959
960             r12              = _mm_mul_ps(rsq12,rinv12);
961             r12              = _mm_andnot_ps(dummy_mask,r12);
962
963             /* Calculate table index by multiplying r with table scale and truncate to integer */
964             rt               = _mm_mul_ps(r12,vftabscale);
965             vfitab           = _mm_cvttps_epi32(rt);
966 #ifdef __XOP__
967             vfeps            = _mm_frcz_ps(rt);
968 #else
969             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
970 #endif
971             twovfeps         = _mm_add_ps(vfeps,vfeps);
972             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
973
974             /* CUBIC SPLINE TABLE ELECTROSTATICS */
975             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
976             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
977             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
978             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
979             _MM_TRANSPOSE4_PS(Y,F,G,H);
980             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
981             VV               = _mm_macc_ps(vfeps,Fp,Y);
982             velec            = _mm_mul_ps(qq12,VV);
983             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
984             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
985
986             /* Update potential sum for this i atom from the interaction with this j atom. */
987             velec            = _mm_andnot_ps(dummy_mask,velec);
988             velecsum         = _mm_add_ps(velecsum,velec);
989
990             fscal            = felec;
991
992             fscal            = _mm_andnot_ps(dummy_mask,fscal);
993
994              /* Update vectorial force */
995             fix1             = _mm_macc_ps(dx12,fscal,fix1);
996             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
997             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
998
999             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1000             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1001             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1002
1003             /**************************
1004              * CALCULATE INTERACTIONS *
1005              **************************/
1006
1007             r13              = _mm_mul_ps(rsq13,rinv13);
1008             r13              = _mm_andnot_ps(dummy_mask,r13);
1009
1010             /* Calculate table index by multiplying r with table scale and truncate to integer */
1011             rt               = _mm_mul_ps(r13,vftabscale);
1012             vfitab           = _mm_cvttps_epi32(rt);
1013 #ifdef __XOP__
1014             vfeps            = _mm_frcz_ps(rt);
1015 #else
1016             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1017 #endif
1018             twovfeps         = _mm_add_ps(vfeps,vfeps);
1019             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1020
1021             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1022             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1023             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1024             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1025             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1026             _MM_TRANSPOSE4_PS(Y,F,G,H);
1027             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1028             VV               = _mm_macc_ps(vfeps,Fp,Y);
1029             velec            = _mm_mul_ps(qq13,VV);
1030             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1031             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1032
1033             /* Update potential sum for this i atom from the interaction with this j atom. */
1034             velec            = _mm_andnot_ps(dummy_mask,velec);
1035             velecsum         = _mm_add_ps(velecsum,velec);
1036
1037             fscal            = felec;
1038
1039             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1040
1041              /* Update vectorial force */
1042             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1043             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1044             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1045
1046             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1047             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1048             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1049
1050             /**************************
1051              * CALCULATE INTERACTIONS *
1052              **************************/
1053
1054             r21              = _mm_mul_ps(rsq21,rinv21);
1055             r21              = _mm_andnot_ps(dummy_mask,r21);
1056
1057             /* Calculate table index by multiplying r with table scale and truncate to integer */
1058             rt               = _mm_mul_ps(r21,vftabscale);
1059             vfitab           = _mm_cvttps_epi32(rt);
1060 #ifdef __XOP__
1061             vfeps            = _mm_frcz_ps(rt);
1062 #else
1063             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1064 #endif
1065             twovfeps         = _mm_add_ps(vfeps,vfeps);
1066             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1067
1068             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1069             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1070             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1071             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1072             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1073             _MM_TRANSPOSE4_PS(Y,F,G,H);
1074             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1075             VV               = _mm_macc_ps(vfeps,Fp,Y);
1076             velec            = _mm_mul_ps(qq21,VV);
1077             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1078             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1079
1080             /* Update potential sum for this i atom from the interaction with this j atom. */
1081             velec            = _mm_andnot_ps(dummy_mask,velec);
1082             velecsum         = _mm_add_ps(velecsum,velec);
1083
1084             fscal            = felec;
1085
1086             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1087
1088              /* Update vectorial force */
1089             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1090             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1091             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1092
1093             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1094             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1095             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1096
1097             /**************************
1098              * CALCULATE INTERACTIONS *
1099              **************************/
1100
1101             r22              = _mm_mul_ps(rsq22,rinv22);
1102             r22              = _mm_andnot_ps(dummy_mask,r22);
1103
1104             /* Calculate table index by multiplying r with table scale and truncate to integer */
1105             rt               = _mm_mul_ps(r22,vftabscale);
1106             vfitab           = _mm_cvttps_epi32(rt);
1107 #ifdef __XOP__
1108             vfeps            = _mm_frcz_ps(rt);
1109 #else
1110             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1111 #endif
1112             twovfeps         = _mm_add_ps(vfeps,vfeps);
1113             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1114
1115             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1116             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1117             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1118             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1119             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1120             _MM_TRANSPOSE4_PS(Y,F,G,H);
1121             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1122             VV               = _mm_macc_ps(vfeps,Fp,Y);
1123             velec            = _mm_mul_ps(qq22,VV);
1124             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1125             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1126
1127             /* Update potential sum for this i atom from the interaction with this j atom. */
1128             velec            = _mm_andnot_ps(dummy_mask,velec);
1129             velecsum         = _mm_add_ps(velecsum,velec);
1130
1131             fscal            = felec;
1132
1133             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1134
1135              /* Update vectorial force */
1136             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1137             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1138             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1139
1140             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1141             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1142             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1143
1144             /**************************
1145              * CALCULATE INTERACTIONS *
1146              **************************/
1147
1148             r23              = _mm_mul_ps(rsq23,rinv23);
1149             r23              = _mm_andnot_ps(dummy_mask,r23);
1150
1151             /* Calculate table index by multiplying r with table scale and truncate to integer */
1152             rt               = _mm_mul_ps(r23,vftabscale);
1153             vfitab           = _mm_cvttps_epi32(rt);
1154 #ifdef __XOP__
1155             vfeps            = _mm_frcz_ps(rt);
1156 #else
1157             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1158 #endif
1159             twovfeps         = _mm_add_ps(vfeps,vfeps);
1160             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1161
1162             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1163             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1164             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1165             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1166             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1167             _MM_TRANSPOSE4_PS(Y,F,G,H);
1168             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1169             VV               = _mm_macc_ps(vfeps,Fp,Y);
1170             velec            = _mm_mul_ps(qq23,VV);
1171             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1172             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1173
1174             /* Update potential sum for this i atom from the interaction with this j atom. */
1175             velec            = _mm_andnot_ps(dummy_mask,velec);
1176             velecsum         = _mm_add_ps(velecsum,velec);
1177
1178             fscal            = felec;
1179
1180             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1181
1182              /* Update vectorial force */
1183             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1184             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1185             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1186
1187             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1188             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1189             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1190
1191             /**************************
1192              * CALCULATE INTERACTIONS *
1193              **************************/
1194
1195             r31              = _mm_mul_ps(rsq31,rinv31);
1196             r31              = _mm_andnot_ps(dummy_mask,r31);
1197
1198             /* Calculate table index by multiplying r with table scale and truncate to integer */
1199             rt               = _mm_mul_ps(r31,vftabscale);
1200             vfitab           = _mm_cvttps_epi32(rt);
1201 #ifdef __XOP__
1202             vfeps            = _mm_frcz_ps(rt);
1203 #else
1204             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1205 #endif
1206             twovfeps         = _mm_add_ps(vfeps,vfeps);
1207             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1208
1209             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1210             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1211             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1212             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1213             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1214             _MM_TRANSPOSE4_PS(Y,F,G,H);
1215             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1216             VV               = _mm_macc_ps(vfeps,Fp,Y);
1217             velec            = _mm_mul_ps(qq31,VV);
1218             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1219             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1220
1221             /* Update potential sum for this i atom from the interaction with this j atom. */
1222             velec            = _mm_andnot_ps(dummy_mask,velec);
1223             velecsum         = _mm_add_ps(velecsum,velec);
1224
1225             fscal            = felec;
1226
1227             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1228
1229              /* Update vectorial force */
1230             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1231             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1232             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1233
1234             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1235             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1236             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1237
1238             /**************************
1239              * CALCULATE INTERACTIONS *
1240              **************************/
1241
1242             r32              = _mm_mul_ps(rsq32,rinv32);
1243             r32              = _mm_andnot_ps(dummy_mask,r32);
1244
1245             /* Calculate table index by multiplying r with table scale and truncate to integer */
1246             rt               = _mm_mul_ps(r32,vftabscale);
1247             vfitab           = _mm_cvttps_epi32(rt);
1248 #ifdef __XOP__
1249             vfeps            = _mm_frcz_ps(rt);
1250 #else
1251             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1252 #endif
1253             twovfeps         = _mm_add_ps(vfeps,vfeps);
1254             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1255
1256             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1257             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1258             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1259             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1260             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1261             _MM_TRANSPOSE4_PS(Y,F,G,H);
1262             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1263             VV               = _mm_macc_ps(vfeps,Fp,Y);
1264             velec            = _mm_mul_ps(qq32,VV);
1265             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1266             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1267
1268             /* Update potential sum for this i atom from the interaction with this j atom. */
1269             velec            = _mm_andnot_ps(dummy_mask,velec);
1270             velecsum         = _mm_add_ps(velecsum,velec);
1271
1272             fscal            = felec;
1273
1274             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1275
1276              /* Update vectorial force */
1277             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1278             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1279             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1280
1281             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1282             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1283             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1284
1285             /**************************
1286              * CALCULATE INTERACTIONS *
1287              **************************/
1288
1289             r33              = _mm_mul_ps(rsq33,rinv33);
1290             r33              = _mm_andnot_ps(dummy_mask,r33);
1291
1292             /* Calculate table index by multiplying r with table scale and truncate to integer */
1293             rt               = _mm_mul_ps(r33,vftabscale);
1294             vfitab           = _mm_cvttps_epi32(rt);
1295 #ifdef __XOP__
1296             vfeps            = _mm_frcz_ps(rt);
1297 #else
1298             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1299 #endif
1300             twovfeps         = _mm_add_ps(vfeps,vfeps);
1301             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1302
1303             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1304             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1305             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1306             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1307             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1308             _MM_TRANSPOSE4_PS(Y,F,G,H);
1309             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1310             VV               = _mm_macc_ps(vfeps,Fp,Y);
1311             velec            = _mm_mul_ps(qq33,VV);
1312             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1313             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1314
1315             /* Update potential sum for this i atom from the interaction with this j atom. */
1316             velec            = _mm_andnot_ps(dummy_mask,velec);
1317             velecsum         = _mm_add_ps(velecsum,velec);
1318
1319             fscal            = felec;
1320
1321             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1322
1323              /* Update vectorial force */
1324             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1325             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1326             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1327
1328             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1329             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1330             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1331
1332             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1333             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1334             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1335             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1336
1337             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1338                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1339                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1340
1341             /* Inner loop uses 486 flops */
1342         }
1343
1344         /* End of innermost loop */
1345
1346         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1347                                               f+i_coord_offset,fshift+i_shift_offset);
1348
1349         ggid                        = gid[iidx];
1350         /* Update potential energies */
1351         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1352         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1353
1354         /* Increment number of inner iterations */
1355         inneriter                  += j_index_end - j_index_start;
1356
1357         /* Outer loop uses 26 flops */
1358     }
1359
1360     /* Increment number of outer iterations */
1361     outeriter        += nri;
1362
1363     /* Update outer/inner flops */
1364
1365     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*486);
1366 }
1367 /*
1368  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1369  * Electrostatics interaction: CubicSplineTable
1370  * VdW interaction:            CubicSplineTable
1371  * Geometry:                   Water4-Water4
1372  * Calculate force/pot:        Force
1373  */
1374 void
1375 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1376                     (t_nblist * gmx_restrict                nlist,
1377                      rvec * gmx_restrict                    xx,
1378                      rvec * gmx_restrict                    ff,
1379                      t_forcerec * gmx_restrict              fr,
1380                      t_mdatoms * gmx_restrict               mdatoms,
1381                      nb_kernel_data_t * gmx_restrict        kernel_data,
1382                      t_nrnb * gmx_restrict                  nrnb)
1383 {
1384     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1385      * just 0 for non-waters.
1386      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1387      * jnr indices corresponding to data put in the four positions in the SIMD register.
1388      */
1389     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1390     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1391     int              jnrA,jnrB,jnrC,jnrD;
1392     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1393     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1394     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1395     real             rcutoff_scalar;
1396     real             *shiftvec,*fshift,*x,*f;
1397     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1398     real             scratch[4*DIM];
1399     __m128           fscal,rcutoff,rcutoff2,jidxall;
1400     int              vdwioffset0;
1401     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1402     int              vdwioffset1;
1403     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1404     int              vdwioffset2;
1405     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1406     int              vdwioffset3;
1407     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1408     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1409     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1410     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1411     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1412     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1413     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1414     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1415     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1416     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1417     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1418     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1419     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1420     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1421     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1422     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1423     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1424     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1425     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1426     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1427     real             *charge;
1428     int              nvdwtype;
1429     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1430     int              *vdwtype;
1431     real             *vdwparam;
1432     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1433     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1434     __m128i          vfitab;
1435     __m128i          ifour       = _mm_set1_epi32(4);
1436     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1437     real             *vftab;
1438     __m128           dummy_mask,cutoff_mask;
1439     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1440     __m128           one     = _mm_set1_ps(1.0);
1441     __m128           two     = _mm_set1_ps(2.0);
1442     x                = xx[0];
1443     f                = ff[0];
1444
1445     nri              = nlist->nri;
1446     iinr             = nlist->iinr;
1447     jindex           = nlist->jindex;
1448     jjnr             = nlist->jjnr;
1449     shiftidx         = nlist->shift;
1450     gid              = nlist->gid;
1451     shiftvec         = fr->shift_vec[0];
1452     fshift           = fr->fshift[0];
1453     facel            = _mm_set1_ps(fr->epsfac);
1454     charge           = mdatoms->chargeA;
1455     nvdwtype         = fr->ntype;
1456     vdwparam         = fr->nbfp;
1457     vdwtype          = mdatoms->typeA;
1458
1459     vftab            = kernel_data->table_elec_vdw->data;
1460     vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1461
1462     /* Setup water-specific parameters */
1463     inr              = nlist->iinr[0];
1464     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1465     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1466     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1467     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1468
1469     jq1              = _mm_set1_ps(charge[inr+1]);
1470     jq2              = _mm_set1_ps(charge[inr+2]);
1471     jq3              = _mm_set1_ps(charge[inr+3]);
1472     vdwjidx0A        = 2*vdwtype[inr+0];
1473     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1474     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1475     qq11             = _mm_mul_ps(iq1,jq1);
1476     qq12             = _mm_mul_ps(iq1,jq2);
1477     qq13             = _mm_mul_ps(iq1,jq3);
1478     qq21             = _mm_mul_ps(iq2,jq1);
1479     qq22             = _mm_mul_ps(iq2,jq2);
1480     qq23             = _mm_mul_ps(iq2,jq3);
1481     qq31             = _mm_mul_ps(iq3,jq1);
1482     qq32             = _mm_mul_ps(iq3,jq2);
1483     qq33             = _mm_mul_ps(iq3,jq3);
1484
1485     /* Avoid stupid compiler warnings */
1486     jnrA = jnrB = jnrC = jnrD = 0;
1487     j_coord_offsetA = 0;
1488     j_coord_offsetB = 0;
1489     j_coord_offsetC = 0;
1490     j_coord_offsetD = 0;
1491
1492     outeriter        = 0;
1493     inneriter        = 0;
1494
1495     for(iidx=0;iidx<4*DIM;iidx++)
1496     {
1497         scratch[iidx] = 0.0;
1498     }
1499
1500     /* Start outer loop over neighborlists */
1501     for(iidx=0; iidx<nri; iidx++)
1502     {
1503         /* Load shift vector for this list */
1504         i_shift_offset   = DIM*shiftidx[iidx];
1505
1506         /* Load limits for loop over neighbors */
1507         j_index_start    = jindex[iidx];
1508         j_index_end      = jindex[iidx+1];
1509
1510         /* Get outer coordinate index */
1511         inr              = iinr[iidx];
1512         i_coord_offset   = DIM*inr;
1513
1514         /* Load i particle coords and add shift vector */
1515         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1516                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1517
1518         fix0             = _mm_setzero_ps();
1519         fiy0             = _mm_setzero_ps();
1520         fiz0             = _mm_setzero_ps();
1521         fix1             = _mm_setzero_ps();
1522         fiy1             = _mm_setzero_ps();
1523         fiz1             = _mm_setzero_ps();
1524         fix2             = _mm_setzero_ps();
1525         fiy2             = _mm_setzero_ps();
1526         fiz2             = _mm_setzero_ps();
1527         fix3             = _mm_setzero_ps();
1528         fiy3             = _mm_setzero_ps();
1529         fiz3             = _mm_setzero_ps();
1530
1531         /* Start inner kernel loop */
1532         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1533         {
1534
1535             /* Get j neighbor index, and coordinate index */
1536             jnrA             = jjnr[jidx];
1537             jnrB             = jjnr[jidx+1];
1538             jnrC             = jjnr[jidx+2];
1539             jnrD             = jjnr[jidx+3];
1540             j_coord_offsetA  = DIM*jnrA;
1541             j_coord_offsetB  = DIM*jnrB;
1542             j_coord_offsetC  = DIM*jnrC;
1543             j_coord_offsetD  = DIM*jnrD;
1544
1545             /* load j atom coordinates */
1546             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1547                                               x+j_coord_offsetC,x+j_coord_offsetD,
1548                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1549                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1550
1551             /* Calculate displacement vector */
1552             dx00             = _mm_sub_ps(ix0,jx0);
1553             dy00             = _mm_sub_ps(iy0,jy0);
1554             dz00             = _mm_sub_ps(iz0,jz0);
1555             dx11             = _mm_sub_ps(ix1,jx1);
1556             dy11             = _mm_sub_ps(iy1,jy1);
1557             dz11             = _mm_sub_ps(iz1,jz1);
1558             dx12             = _mm_sub_ps(ix1,jx2);
1559             dy12             = _mm_sub_ps(iy1,jy2);
1560             dz12             = _mm_sub_ps(iz1,jz2);
1561             dx13             = _mm_sub_ps(ix1,jx3);
1562             dy13             = _mm_sub_ps(iy1,jy3);
1563             dz13             = _mm_sub_ps(iz1,jz3);
1564             dx21             = _mm_sub_ps(ix2,jx1);
1565             dy21             = _mm_sub_ps(iy2,jy1);
1566             dz21             = _mm_sub_ps(iz2,jz1);
1567             dx22             = _mm_sub_ps(ix2,jx2);
1568             dy22             = _mm_sub_ps(iy2,jy2);
1569             dz22             = _mm_sub_ps(iz2,jz2);
1570             dx23             = _mm_sub_ps(ix2,jx3);
1571             dy23             = _mm_sub_ps(iy2,jy3);
1572             dz23             = _mm_sub_ps(iz2,jz3);
1573             dx31             = _mm_sub_ps(ix3,jx1);
1574             dy31             = _mm_sub_ps(iy3,jy1);
1575             dz31             = _mm_sub_ps(iz3,jz1);
1576             dx32             = _mm_sub_ps(ix3,jx2);
1577             dy32             = _mm_sub_ps(iy3,jy2);
1578             dz32             = _mm_sub_ps(iz3,jz2);
1579             dx33             = _mm_sub_ps(ix3,jx3);
1580             dy33             = _mm_sub_ps(iy3,jy3);
1581             dz33             = _mm_sub_ps(iz3,jz3);
1582
1583             /* Calculate squared distance and things based on it */
1584             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1585             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1586             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1587             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1588             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1589             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1590             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1591             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1592             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1593             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1594
1595             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1596             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1597             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1598             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1599             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1600             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1601             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1602             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1603             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1604             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1605
1606             fjx0             = _mm_setzero_ps();
1607             fjy0             = _mm_setzero_ps();
1608             fjz0             = _mm_setzero_ps();
1609             fjx1             = _mm_setzero_ps();
1610             fjy1             = _mm_setzero_ps();
1611             fjz1             = _mm_setzero_ps();
1612             fjx2             = _mm_setzero_ps();
1613             fjy2             = _mm_setzero_ps();
1614             fjz2             = _mm_setzero_ps();
1615             fjx3             = _mm_setzero_ps();
1616             fjy3             = _mm_setzero_ps();
1617             fjz3             = _mm_setzero_ps();
1618
1619             /**************************
1620              * CALCULATE INTERACTIONS *
1621              **************************/
1622
1623             r00              = _mm_mul_ps(rsq00,rinv00);
1624
1625             /* Calculate table index by multiplying r with table scale and truncate to integer */
1626             rt               = _mm_mul_ps(r00,vftabscale);
1627             vfitab           = _mm_cvttps_epi32(rt);
1628 #ifdef __XOP__
1629             vfeps            = _mm_frcz_ps(rt);
1630 #else
1631             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1632 #endif
1633             twovfeps         = _mm_add_ps(vfeps,vfeps);
1634             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1635
1636             /* CUBIC SPLINE TABLE DISPERSION */
1637             vfitab           = _mm_add_epi32(vfitab,ifour);
1638             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1639             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1640             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1641             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1642             _MM_TRANSPOSE4_PS(Y,F,G,H);
1643             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1644             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1645             fvdw6            = _mm_mul_ps(c6_00,FF);
1646
1647             /* CUBIC SPLINE TABLE REPULSION */
1648             vfitab           = _mm_add_epi32(vfitab,ifour);
1649             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1650             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1651             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1652             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1653             _MM_TRANSPOSE4_PS(Y,F,G,H);
1654             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1655             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1656             fvdw12           = _mm_mul_ps(c12_00,FF);
1657             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1658
1659             fscal            = fvdw;
1660
1661              /* Update vectorial force */
1662             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1663             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1664             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1665
1666             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1667             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1668             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1669
1670             /**************************
1671              * CALCULATE INTERACTIONS *
1672              **************************/
1673
1674             r11              = _mm_mul_ps(rsq11,rinv11);
1675
1676             /* Calculate table index by multiplying r with table scale and truncate to integer */
1677             rt               = _mm_mul_ps(r11,vftabscale);
1678             vfitab           = _mm_cvttps_epi32(rt);
1679 #ifdef __XOP__
1680             vfeps            = _mm_frcz_ps(rt);
1681 #else
1682             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1683 #endif
1684             twovfeps         = _mm_add_ps(vfeps,vfeps);
1685             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1686
1687             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1688             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1689             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1690             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1691             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1692             _MM_TRANSPOSE4_PS(Y,F,G,H);
1693             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1694             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1695             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1696
1697             fscal            = felec;
1698
1699              /* Update vectorial force */
1700             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1701             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1702             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1703
1704             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1705             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1706             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1707
1708             /**************************
1709              * CALCULATE INTERACTIONS *
1710              **************************/
1711
1712             r12              = _mm_mul_ps(rsq12,rinv12);
1713
1714             /* Calculate table index by multiplying r with table scale and truncate to integer */
1715             rt               = _mm_mul_ps(r12,vftabscale);
1716             vfitab           = _mm_cvttps_epi32(rt);
1717 #ifdef __XOP__
1718             vfeps            = _mm_frcz_ps(rt);
1719 #else
1720             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1721 #endif
1722             twovfeps         = _mm_add_ps(vfeps,vfeps);
1723             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1724
1725             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1726             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1727             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1728             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1729             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1730             _MM_TRANSPOSE4_PS(Y,F,G,H);
1731             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1732             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1733             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1734
1735             fscal            = felec;
1736
1737              /* Update vectorial force */
1738             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1739             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1740             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1741
1742             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1743             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1744             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1745
1746             /**************************
1747              * CALCULATE INTERACTIONS *
1748              **************************/
1749
1750             r13              = _mm_mul_ps(rsq13,rinv13);
1751
1752             /* Calculate table index by multiplying r with table scale and truncate to integer */
1753             rt               = _mm_mul_ps(r13,vftabscale);
1754             vfitab           = _mm_cvttps_epi32(rt);
1755 #ifdef __XOP__
1756             vfeps            = _mm_frcz_ps(rt);
1757 #else
1758             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1759 #endif
1760             twovfeps         = _mm_add_ps(vfeps,vfeps);
1761             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1762
1763             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1764             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1765             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1766             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1767             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1768             _MM_TRANSPOSE4_PS(Y,F,G,H);
1769             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1770             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1771             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1772
1773             fscal            = felec;
1774
1775              /* Update vectorial force */
1776             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1777             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1778             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1779
1780             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1781             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1782             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1783
1784             /**************************
1785              * CALCULATE INTERACTIONS *
1786              **************************/
1787
1788             r21              = _mm_mul_ps(rsq21,rinv21);
1789
1790             /* Calculate table index by multiplying r with table scale and truncate to integer */
1791             rt               = _mm_mul_ps(r21,vftabscale);
1792             vfitab           = _mm_cvttps_epi32(rt);
1793 #ifdef __XOP__
1794             vfeps            = _mm_frcz_ps(rt);
1795 #else
1796             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1797 #endif
1798             twovfeps         = _mm_add_ps(vfeps,vfeps);
1799             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1800
1801             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1802             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1803             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1804             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1805             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1806             _MM_TRANSPOSE4_PS(Y,F,G,H);
1807             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1808             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1809             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1810
1811             fscal            = felec;
1812
1813              /* Update vectorial force */
1814             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1815             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1816             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1817
1818             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1819             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1820             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1821
1822             /**************************
1823              * CALCULATE INTERACTIONS *
1824              **************************/
1825
1826             r22              = _mm_mul_ps(rsq22,rinv22);
1827
1828             /* Calculate table index by multiplying r with table scale and truncate to integer */
1829             rt               = _mm_mul_ps(r22,vftabscale);
1830             vfitab           = _mm_cvttps_epi32(rt);
1831 #ifdef __XOP__
1832             vfeps            = _mm_frcz_ps(rt);
1833 #else
1834             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1835 #endif
1836             twovfeps         = _mm_add_ps(vfeps,vfeps);
1837             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1838
1839             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1840             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1841             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1842             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1843             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1844             _MM_TRANSPOSE4_PS(Y,F,G,H);
1845             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1846             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1847             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1848
1849             fscal            = felec;
1850
1851              /* Update vectorial force */
1852             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1853             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1854             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1855
1856             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1857             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1858             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1859
1860             /**************************
1861              * CALCULATE INTERACTIONS *
1862              **************************/
1863
1864             r23              = _mm_mul_ps(rsq23,rinv23);
1865
1866             /* Calculate table index by multiplying r with table scale and truncate to integer */
1867             rt               = _mm_mul_ps(r23,vftabscale);
1868             vfitab           = _mm_cvttps_epi32(rt);
1869 #ifdef __XOP__
1870             vfeps            = _mm_frcz_ps(rt);
1871 #else
1872             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1873 #endif
1874             twovfeps         = _mm_add_ps(vfeps,vfeps);
1875             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1876
1877             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1878             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1879             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1880             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1881             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1882             _MM_TRANSPOSE4_PS(Y,F,G,H);
1883             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1884             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1885             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1886
1887             fscal            = felec;
1888
1889              /* Update vectorial force */
1890             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1891             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1892             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1893
1894             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1895             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1896             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1897
1898             /**************************
1899              * CALCULATE INTERACTIONS *
1900              **************************/
1901
1902             r31              = _mm_mul_ps(rsq31,rinv31);
1903
1904             /* Calculate table index by multiplying r with table scale and truncate to integer */
1905             rt               = _mm_mul_ps(r31,vftabscale);
1906             vfitab           = _mm_cvttps_epi32(rt);
1907 #ifdef __XOP__
1908             vfeps            = _mm_frcz_ps(rt);
1909 #else
1910             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1911 #endif
1912             twovfeps         = _mm_add_ps(vfeps,vfeps);
1913             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1914
1915             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1916             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1917             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1918             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1919             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1920             _MM_TRANSPOSE4_PS(Y,F,G,H);
1921             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1922             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1923             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1924
1925             fscal            = felec;
1926
1927              /* Update vectorial force */
1928             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1929             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1930             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1931
1932             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1933             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1934             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1935
1936             /**************************
1937              * CALCULATE INTERACTIONS *
1938              **************************/
1939
1940             r32              = _mm_mul_ps(rsq32,rinv32);
1941
1942             /* Calculate table index by multiplying r with table scale and truncate to integer */
1943             rt               = _mm_mul_ps(r32,vftabscale);
1944             vfitab           = _mm_cvttps_epi32(rt);
1945 #ifdef __XOP__
1946             vfeps            = _mm_frcz_ps(rt);
1947 #else
1948             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1949 #endif
1950             twovfeps         = _mm_add_ps(vfeps,vfeps);
1951             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1952
1953             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1954             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1955             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1956             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1957             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1958             _MM_TRANSPOSE4_PS(Y,F,G,H);
1959             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1960             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1961             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1962
1963             fscal            = felec;
1964
1965              /* Update vectorial force */
1966             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1967             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1968             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1969
1970             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1971             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1972             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1973
1974             /**************************
1975              * CALCULATE INTERACTIONS *
1976              **************************/
1977
1978             r33              = _mm_mul_ps(rsq33,rinv33);
1979
1980             /* Calculate table index by multiplying r with table scale and truncate to integer */
1981             rt               = _mm_mul_ps(r33,vftabscale);
1982             vfitab           = _mm_cvttps_epi32(rt);
1983 #ifdef __XOP__
1984             vfeps            = _mm_frcz_ps(rt);
1985 #else
1986             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1987 #endif
1988             twovfeps         = _mm_add_ps(vfeps,vfeps);
1989             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1990
1991             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1992             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1993             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1994             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1995             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1996             _MM_TRANSPOSE4_PS(Y,F,G,H);
1997             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1998             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1999             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2000
2001             fscal            = felec;
2002
2003              /* Update vectorial force */
2004             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2005             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2006             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2007
2008             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2009             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2010             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2011
2012             fjptrA             = f+j_coord_offsetA;
2013             fjptrB             = f+j_coord_offsetB;
2014             fjptrC             = f+j_coord_offsetC;
2015             fjptrD             = f+j_coord_offsetD;
2016
2017             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2018                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2019                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2020
2021             /* Inner loop uses 432 flops */
2022         }
2023
2024         if(jidx<j_index_end)
2025         {
2026
2027             /* Get j neighbor index, and coordinate index */
2028             jnrlistA         = jjnr[jidx];
2029             jnrlistB         = jjnr[jidx+1];
2030             jnrlistC         = jjnr[jidx+2];
2031             jnrlistD         = jjnr[jidx+3];
2032             /* Sign of each element will be negative for non-real atoms.
2033              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2034              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2035              */
2036             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2037             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2038             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2039             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2040             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2041             j_coord_offsetA  = DIM*jnrA;
2042             j_coord_offsetB  = DIM*jnrB;
2043             j_coord_offsetC  = DIM*jnrC;
2044             j_coord_offsetD  = DIM*jnrD;
2045
2046             /* load j atom coordinates */
2047             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2048                                               x+j_coord_offsetC,x+j_coord_offsetD,
2049                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2050                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2051
2052             /* Calculate displacement vector */
2053             dx00             = _mm_sub_ps(ix0,jx0);
2054             dy00             = _mm_sub_ps(iy0,jy0);
2055             dz00             = _mm_sub_ps(iz0,jz0);
2056             dx11             = _mm_sub_ps(ix1,jx1);
2057             dy11             = _mm_sub_ps(iy1,jy1);
2058             dz11             = _mm_sub_ps(iz1,jz1);
2059             dx12             = _mm_sub_ps(ix1,jx2);
2060             dy12             = _mm_sub_ps(iy1,jy2);
2061             dz12             = _mm_sub_ps(iz1,jz2);
2062             dx13             = _mm_sub_ps(ix1,jx3);
2063             dy13             = _mm_sub_ps(iy1,jy3);
2064             dz13             = _mm_sub_ps(iz1,jz3);
2065             dx21             = _mm_sub_ps(ix2,jx1);
2066             dy21             = _mm_sub_ps(iy2,jy1);
2067             dz21             = _mm_sub_ps(iz2,jz1);
2068             dx22             = _mm_sub_ps(ix2,jx2);
2069             dy22             = _mm_sub_ps(iy2,jy2);
2070             dz22             = _mm_sub_ps(iz2,jz2);
2071             dx23             = _mm_sub_ps(ix2,jx3);
2072             dy23             = _mm_sub_ps(iy2,jy3);
2073             dz23             = _mm_sub_ps(iz2,jz3);
2074             dx31             = _mm_sub_ps(ix3,jx1);
2075             dy31             = _mm_sub_ps(iy3,jy1);
2076             dz31             = _mm_sub_ps(iz3,jz1);
2077             dx32             = _mm_sub_ps(ix3,jx2);
2078             dy32             = _mm_sub_ps(iy3,jy2);
2079             dz32             = _mm_sub_ps(iz3,jz2);
2080             dx33             = _mm_sub_ps(ix3,jx3);
2081             dy33             = _mm_sub_ps(iy3,jy3);
2082             dz33             = _mm_sub_ps(iz3,jz3);
2083
2084             /* Calculate squared distance and things based on it */
2085             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2086             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2087             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2088             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2089             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2090             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2091             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2092             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2093             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2094             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2095
2096             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2097             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2098             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2099             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2100             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2101             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2102             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2103             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2104             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2105             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2106
2107             fjx0             = _mm_setzero_ps();
2108             fjy0             = _mm_setzero_ps();
2109             fjz0             = _mm_setzero_ps();
2110             fjx1             = _mm_setzero_ps();
2111             fjy1             = _mm_setzero_ps();
2112             fjz1             = _mm_setzero_ps();
2113             fjx2             = _mm_setzero_ps();
2114             fjy2             = _mm_setzero_ps();
2115             fjz2             = _mm_setzero_ps();
2116             fjx3             = _mm_setzero_ps();
2117             fjy3             = _mm_setzero_ps();
2118             fjz3             = _mm_setzero_ps();
2119
2120             /**************************
2121              * CALCULATE INTERACTIONS *
2122              **************************/
2123
2124             r00              = _mm_mul_ps(rsq00,rinv00);
2125             r00              = _mm_andnot_ps(dummy_mask,r00);
2126
2127             /* Calculate table index by multiplying r with table scale and truncate to integer */
2128             rt               = _mm_mul_ps(r00,vftabscale);
2129             vfitab           = _mm_cvttps_epi32(rt);
2130 #ifdef __XOP__
2131             vfeps            = _mm_frcz_ps(rt);
2132 #else
2133             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2134 #endif
2135             twovfeps         = _mm_add_ps(vfeps,vfeps);
2136             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2137
2138             /* CUBIC SPLINE TABLE DISPERSION */
2139             vfitab           = _mm_add_epi32(vfitab,ifour);
2140             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2141             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2142             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2143             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2144             _MM_TRANSPOSE4_PS(Y,F,G,H);
2145             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2146             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2147             fvdw6            = _mm_mul_ps(c6_00,FF);
2148
2149             /* CUBIC SPLINE TABLE REPULSION */
2150             vfitab           = _mm_add_epi32(vfitab,ifour);
2151             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2152             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2153             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2154             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2155             _MM_TRANSPOSE4_PS(Y,F,G,H);
2156             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2157             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2158             fvdw12           = _mm_mul_ps(c12_00,FF);
2159             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2160
2161             fscal            = fvdw;
2162
2163             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2164
2165              /* Update vectorial force */
2166             fix0             = _mm_macc_ps(dx00,fscal,fix0);
2167             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
2168             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
2169
2170             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
2171             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
2172             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
2173
2174             /**************************
2175              * CALCULATE INTERACTIONS *
2176              **************************/
2177
2178             r11              = _mm_mul_ps(rsq11,rinv11);
2179             r11              = _mm_andnot_ps(dummy_mask,r11);
2180
2181             /* Calculate table index by multiplying r with table scale and truncate to integer */
2182             rt               = _mm_mul_ps(r11,vftabscale);
2183             vfitab           = _mm_cvttps_epi32(rt);
2184 #ifdef __XOP__
2185             vfeps            = _mm_frcz_ps(rt);
2186 #else
2187             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2188 #endif
2189             twovfeps         = _mm_add_ps(vfeps,vfeps);
2190             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2191
2192             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2193             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2194             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2195             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2196             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2197             _MM_TRANSPOSE4_PS(Y,F,G,H);
2198             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2199             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2200             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2201
2202             fscal            = felec;
2203
2204             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2205
2206              /* Update vectorial force */
2207             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2208             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2209             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2210
2211             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2212             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2213             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2214
2215             /**************************
2216              * CALCULATE INTERACTIONS *
2217              **************************/
2218
2219             r12              = _mm_mul_ps(rsq12,rinv12);
2220             r12              = _mm_andnot_ps(dummy_mask,r12);
2221
2222             /* Calculate table index by multiplying r with table scale and truncate to integer */
2223             rt               = _mm_mul_ps(r12,vftabscale);
2224             vfitab           = _mm_cvttps_epi32(rt);
2225 #ifdef __XOP__
2226             vfeps            = _mm_frcz_ps(rt);
2227 #else
2228             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2229 #endif
2230             twovfeps         = _mm_add_ps(vfeps,vfeps);
2231             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2232
2233             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2234             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2235             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2236             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2237             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2238             _MM_TRANSPOSE4_PS(Y,F,G,H);
2239             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2240             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2241             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2242
2243             fscal            = felec;
2244
2245             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2246
2247              /* Update vectorial force */
2248             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2249             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2250             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2251
2252             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2253             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2254             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2255
2256             /**************************
2257              * CALCULATE INTERACTIONS *
2258              **************************/
2259
2260             r13              = _mm_mul_ps(rsq13,rinv13);
2261             r13              = _mm_andnot_ps(dummy_mask,r13);
2262
2263             /* Calculate table index by multiplying r with table scale and truncate to integer */
2264             rt               = _mm_mul_ps(r13,vftabscale);
2265             vfitab           = _mm_cvttps_epi32(rt);
2266 #ifdef __XOP__
2267             vfeps            = _mm_frcz_ps(rt);
2268 #else
2269             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2270 #endif
2271             twovfeps         = _mm_add_ps(vfeps,vfeps);
2272             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2273
2274             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2275             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2276             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2277             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2278             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2279             _MM_TRANSPOSE4_PS(Y,F,G,H);
2280             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2281             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2282             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2283
2284             fscal            = felec;
2285
2286             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2287
2288              /* Update vectorial force */
2289             fix1             = _mm_macc_ps(dx13,fscal,fix1);
2290             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
2291             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
2292
2293             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
2294             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
2295             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
2296
2297             /**************************
2298              * CALCULATE INTERACTIONS *
2299              **************************/
2300
2301             r21              = _mm_mul_ps(rsq21,rinv21);
2302             r21              = _mm_andnot_ps(dummy_mask,r21);
2303
2304             /* Calculate table index by multiplying r with table scale and truncate to integer */
2305             rt               = _mm_mul_ps(r21,vftabscale);
2306             vfitab           = _mm_cvttps_epi32(rt);
2307 #ifdef __XOP__
2308             vfeps            = _mm_frcz_ps(rt);
2309 #else
2310             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2311 #endif
2312             twovfeps         = _mm_add_ps(vfeps,vfeps);
2313             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2314
2315             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2316             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2317             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2318             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2319             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2320             _MM_TRANSPOSE4_PS(Y,F,G,H);
2321             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2322             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2323             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2324
2325             fscal            = felec;
2326
2327             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2328
2329              /* Update vectorial force */
2330             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2331             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2332             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2333
2334             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2335             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2336             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2337
2338             /**************************
2339              * CALCULATE INTERACTIONS *
2340              **************************/
2341
2342             r22              = _mm_mul_ps(rsq22,rinv22);
2343             r22              = _mm_andnot_ps(dummy_mask,r22);
2344
2345             /* Calculate table index by multiplying r with table scale and truncate to integer */
2346             rt               = _mm_mul_ps(r22,vftabscale);
2347             vfitab           = _mm_cvttps_epi32(rt);
2348 #ifdef __XOP__
2349             vfeps            = _mm_frcz_ps(rt);
2350 #else
2351             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2352 #endif
2353             twovfeps         = _mm_add_ps(vfeps,vfeps);
2354             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2355
2356             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2357             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2358             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2359             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2360             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2361             _MM_TRANSPOSE4_PS(Y,F,G,H);
2362             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2363             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2364             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2365
2366             fscal            = felec;
2367
2368             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2369
2370              /* Update vectorial force */
2371             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2372             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2373             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2374
2375             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2376             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2377             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2378
2379             /**************************
2380              * CALCULATE INTERACTIONS *
2381              **************************/
2382
2383             r23              = _mm_mul_ps(rsq23,rinv23);
2384             r23              = _mm_andnot_ps(dummy_mask,r23);
2385
2386             /* Calculate table index by multiplying r with table scale and truncate to integer */
2387             rt               = _mm_mul_ps(r23,vftabscale);
2388             vfitab           = _mm_cvttps_epi32(rt);
2389 #ifdef __XOP__
2390             vfeps            = _mm_frcz_ps(rt);
2391 #else
2392             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2393 #endif
2394             twovfeps         = _mm_add_ps(vfeps,vfeps);
2395             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2396
2397             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2398             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2399             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2400             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2401             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2402             _MM_TRANSPOSE4_PS(Y,F,G,H);
2403             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2404             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2405             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2406
2407             fscal            = felec;
2408
2409             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2410
2411              /* Update vectorial force */
2412             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2413             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2414             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2415
2416             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2417             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2418             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2419
2420             /**************************
2421              * CALCULATE INTERACTIONS *
2422              **************************/
2423
2424             r31              = _mm_mul_ps(rsq31,rinv31);
2425             r31              = _mm_andnot_ps(dummy_mask,r31);
2426
2427             /* Calculate table index by multiplying r with table scale and truncate to integer */
2428             rt               = _mm_mul_ps(r31,vftabscale);
2429             vfitab           = _mm_cvttps_epi32(rt);
2430 #ifdef __XOP__
2431             vfeps            = _mm_frcz_ps(rt);
2432 #else
2433             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2434 #endif
2435             twovfeps         = _mm_add_ps(vfeps,vfeps);
2436             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2437
2438             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2439             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2440             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2441             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2442             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2443             _MM_TRANSPOSE4_PS(Y,F,G,H);
2444             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2445             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2446             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2447
2448             fscal            = felec;
2449
2450             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2451
2452              /* Update vectorial force */
2453             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2454             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2455             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2456
2457             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2458             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2459             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2460
2461             /**************************
2462              * CALCULATE INTERACTIONS *
2463              **************************/
2464
2465             r32              = _mm_mul_ps(rsq32,rinv32);
2466             r32              = _mm_andnot_ps(dummy_mask,r32);
2467
2468             /* Calculate table index by multiplying r with table scale and truncate to integer */
2469             rt               = _mm_mul_ps(r32,vftabscale);
2470             vfitab           = _mm_cvttps_epi32(rt);
2471 #ifdef __XOP__
2472             vfeps            = _mm_frcz_ps(rt);
2473 #else
2474             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2475 #endif
2476             twovfeps         = _mm_add_ps(vfeps,vfeps);
2477             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2478
2479             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2480             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2481             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2482             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2483             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2484             _MM_TRANSPOSE4_PS(Y,F,G,H);
2485             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2486             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2487             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2488
2489             fscal            = felec;
2490
2491             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2492
2493              /* Update vectorial force */
2494             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2495             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2496             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2497
2498             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2499             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2500             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2501
2502             /**************************
2503              * CALCULATE INTERACTIONS *
2504              **************************/
2505
2506             r33              = _mm_mul_ps(rsq33,rinv33);
2507             r33              = _mm_andnot_ps(dummy_mask,r33);
2508
2509             /* Calculate table index by multiplying r with table scale and truncate to integer */
2510             rt               = _mm_mul_ps(r33,vftabscale);
2511             vfitab           = _mm_cvttps_epi32(rt);
2512 #ifdef __XOP__
2513             vfeps            = _mm_frcz_ps(rt);
2514 #else
2515             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2516 #endif
2517             twovfeps         = _mm_add_ps(vfeps,vfeps);
2518             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2519
2520             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2521             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2522             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2523             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2524             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2525             _MM_TRANSPOSE4_PS(Y,F,G,H);
2526             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2527             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2528             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2529
2530             fscal            = felec;
2531
2532             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2533
2534              /* Update vectorial force */
2535             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2536             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2537             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2538
2539             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2540             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2541             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2542
2543             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2544             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2545             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2546             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2547
2548             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2549                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2550                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2551
2552             /* Inner loop uses 442 flops */
2553         }
2554
2555         /* End of innermost loop */
2556
2557         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2558                                               f+i_coord_offset,fshift+i_shift_offset);
2559
2560         /* Increment number of inner iterations */
2561         inneriter                  += j_index_end - j_index_start;
2562
2563         /* Outer loop uses 24 flops */
2564     }
2565
2566     /* Increment number of outer iterations */
2567     outeriter        += nri;
2568
2569     /* Update outer/inner flops */
2570
2571     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*442);
2572 }