a501f770d9109bc20fdcf7820e223ef1e2682039
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
52  * Electrostatics interaction: CubicSplineTable
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
116     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
117     __m128i          vfitab;
118     __m128i          ifour       = _mm_set1_epi32(4);
119     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
120     real             *vftab;
121     __m128           dummy_mask,cutoff_mask;
122     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
123     __m128           one     = _mm_set1_ps(1.0);
124     __m128           two     = _mm_set1_ps(2.0);
125     x                = xx[0];
126     f                = ff[0];
127
128     nri              = nlist->nri;
129     iinr             = nlist->iinr;
130     jindex           = nlist->jindex;
131     jjnr             = nlist->jjnr;
132     shiftidx         = nlist->shift;
133     gid              = nlist->gid;
134     shiftvec         = fr->shift_vec[0];
135     fshift           = fr->fshift[0];
136     facel            = _mm_set1_ps(fr->epsfac);
137     charge           = mdatoms->chargeA;
138     nvdwtype         = fr->ntype;
139     vdwparam         = fr->nbfp;
140     vdwtype          = mdatoms->typeA;
141
142     vftab            = kernel_data->table_elec_vdw->data;
143     vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
144
145     /* Setup water-specific parameters */
146     inr              = nlist->iinr[0];
147     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
148     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
149     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
150     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
151
152     jq1              = _mm_set1_ps(charge[inr+1]);
153     jq2              = _mm_set1_ps(charge[inr+2]);
154     jq3              = _mm_set1_ps(charge[inr+3]);
155     vdwjidx0A        = 2*vdwtype[inr+0];
156     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
157     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
158     qq11             = _mm_mul_ps(iq1,jq1);
159     qq12             = _mm_mul_ps(iq1,jq2);
160     qq13             = _mm_mul_ps(iq1,jq3);
161     qq21             = _mm_mul_ps(iq2,jq1);
162     qq22             = _mm_mul_ps(iq2,jq2);
163     qq23             = _mm_mul_ps(iq2,jq3);
164     qq31             = _mm_mul_ps(iq3,jq1);
165     qq32             = _mm_mul_ps(iq3,jq2);
166     qq33             = _mm_mul_ps(iq3,jq3);
167
168     /* Avoid stupid compiler warnings */
169     jnrA = jnrB = jnrC = jnrD = 0;
170     j_coord_offsetA = 0;
171     j_coord_offsetB = 0;
172     j_coord_offsetC = 0;
173     j_coord_offsetD = 0;
174
175     outeriter        = 0;
176     inneriter        = 0;
177
178     for(iidx=0;iidx<4*DIM;iidx++)
179     {
180         scratch[iidx] = 0.0;
181     }
182
183     /* Start outer loop over neighborlists */
184     for(iidx=0; iidx<nri; iidx++)
185     {
186         /* Load shift vector for this list */
187         i_shift_offset   = DIM*shiftidx[iidx];
188
189         /* Load limits for loop over neighbors */
190         j_index_start    = jindex[iidx];
191         j_index_end      = jindex[iidx+1];
192
193         /* Get outer coordinate index */
194         inr              = iinr[iidx];
195         i_coord_offset   = DIM*inr;
196
197         /* Load i particle coords and add shift vector */
198         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
199                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
200
201         fix0             = _mm_setzero_ps();
202         fiy0             = _mm_setzero_ps();
203         fiz0             = _mm_setzero_ps();
204         fix1             = _mm_setzero_ps();
205         fiy1             = _mm_setzero_ps();
206         fiz1             = _mm_setzero_ps();
207         fix2             = _mm_setzero_ps();
208         fiy2             = _mm_setzero_ps();
209         fiz2             = _mm_setzero_ps();
210         fix3             = _mm_setzero_ps();
211         fiy3             = _mm_setzero_ps();
212         fiz3             = _mm_setzero_ps();
213
214         /* Reset potential sums */
215         velecsum         = _mm_setzero_ps();
216         vvdwsum          = _mm_setzero_ps();
217
218         /* Start inner kernel loop */
219         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
220         {
221
222             /* Get j neighbor index, and coordinate index */
223             jnrA             = jjnr[jidx];
224             jnrB             = jjnr[jidx+1];
225             jnrC             = jjnr[jidx+2];
226             jnrD             = jjnr[jidx+3];
227             j_coord_offsetA  = DIM*jnrA;
228             j_coord_offsetB  = DIM*jnrB;
229             j_coord_offsetC  = DIM*jnrC;
230             j_coord_offsetD  = DIM*jnrD;
231
232             /* load j atom coordinates */
233             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
234                                               x+j_coord_offsetC,x+j_coord_offsetD,
235                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
236                                               &jy2,&jz2,&jx3,&jy3,&jz3);
237
238             /* Calculate displacement vector */
239             dx00             = _mm_sub_ps(ix0,jx0);
240             dy00             = _mm_sub_ps(iy0,jy0);
241             dz00             = _mm_sub_ps(iz0,jz0);
242             dx11             = _mm_sub_ps(ix1,jx1);
243             dy11             = _mm_sub_ps(iy1,jy1);
244             dz11             = _mm_sub_ps(iz1,jz1);
245             dx12             = _mm_sub_ps(ix1,jx2);
246             dy12             = _mm_sub_ps(iy1,jy2);
247             dz12             = _mm_sub_ps(iz1,jz2);
248             dx13             = _mm_sub_ps(ix1,jx3);
249             dy13             = _mm_sub_ps(iy1,jy3);
250             dz13             = _mm_sub_ps(iz1,jz3);
251             dx21             = _mm_sub_ps(ix2,jx1);
252             dy21             = _mm_sub_ps(iy2,jy1);
253             dz21             = _mm_sub_ps(iz2,jz1);
254             dx22             = _mm_sub_ps(ix2,jx2);
255             dy22             = _mm_sub_ps(iy2,jy2);
256             dz22             = _mm_sub_ps(iz2,jz2);
257             dx23             = _mm_sub_ps(ix2,jx3);
258             dy23             = _mm_sub_ps(iy2,jy3);
259             dz23             = _mm_sub_ps(iz2,jz3);
260             dx31             = _mm_sub_ps(ix3,jx1);
261             dy31             = _mm_sub_ps(iy3,jy1);
262             dz31             = _mm_sub_ps(iz3,jz1);
263             dx32             = _mm_sub_ps(ix3,jx2);
264             dy32             = _mm_sub_ps(iy3,jy2);
265             dz32             = _mm_sub_ps(iz3,jz2);
266             dx33             = _mm_sub_ps(ix3,jx3);
267             dy33             = _mm_sub_ps(iy3,jy3);
268             dz33             = _mm_sub_ps(iz3,jz3);
269
270             /* Calculate squared distance and things based on it */
271             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
272             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
273             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
274             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
275             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
276             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
277             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
278             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
279             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
280             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
281
282             rinv00           = gmx_mm_invsqrt_ps(rsq00);
283             rinv11           = gmx_mm_invsqrt_ps(rsq11);
284             rinv12           = gmx_mm_invsqrt_ps(rsq12);
285             rinv13           = gmx_mm_invsqrt_ps(rsq13);
286             rinv21           = gmx_mm_invsqrt_ps(rsq21);
287             rinv22           = gmx_mm_invsqrt_ps(rsq22);
288             rinv23           = gmx_mm_invsqrt_ps(rsq23);
289             rinv31           = gmx_mm_invsqrt_ps(rsq31);
290             rinv32           = gmx_mm_invsqrt_ps(rsq32);
291             rinv33           = gmx_mm_invsqrt_ps(rsq33);
292
293             fjx0             = _mm_setzero_ps();
294             fjy0             = _mm_setzero_ps();
295             fjz0             = _mm_setzero_ps();
296             fjx1             = _mm_setzero_ps();
297             fjy1             = _mm_setzero_ps();
298             fjz1             = _mm_setzero_ps();
299             fjx2             = _mm_setzero_ps();
300             fjy2             = _mm_setzero_ps();
301             fjz2             = _mm_setzero_ps();
302             fjx3             = _mm_setzero_ps();
303             fjy3             = _mm_setzero_ps();
304             fjz3             = _mm_setzero_ps();
305
306             /**************************
307              * CALCULATE INTERACTIONS *
308              **************************/
309
310             r00              = _mm_mul_ps(rsq00,rinv00);
311
312             /* Calculate table index by multiplying r with table scale and truncate to integer */
313             rt               = _mm_mul_ps(r00,vftabscale);
314             vfitab           = _mm_cvttps_epi32(rt);
315 #ifdef __XOP__
316             vfeps            = _mm_frcz_ps(rt);
317 #else
318             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
319 #endif
320             twovfeps         = _mm_add_ps(vfeps,vfeps);
321             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
322
323             /* CUBIC SPLINE TABLE DISPERSION */
324             vfitab           = _mm_add_epi32(vfitab,ifour);
325             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
326             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
327             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
328             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
329             _MM_TRANSPOSE4_PS(Y,F,G,H);
330             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
331             VV               = _mm_macc_ps(vfeps,Fp,Y);
332             vvdw6            = _mm_mul_ps(c6_00,VV);
333             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
334             fvdw6            = _mm_mul_ps(c6_00,FF);
335
336             /* CUBIC SPLINE TABLE REPULSION */
337             vfitab           = _mm_add_epi32(vfitab,ifour);
338             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
339             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
340             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
341             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
342             _MM_TRANSPOSE4_PS(Y,F,G,H);
343             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
344             VV               = _mm_macc_ps(vfeps,Fp,Y);
345             vvdw12           = _mm_mul_ps(c12_00,VV);
346             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
347             fvdw12           = _mm_mul_ps(c12_00,FF);
348             vvdw             = _mm_add_ps(vvdw12,vvdw6);
349             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
350
351             /* Update potential sum for this i atom from the interaction with this j atom. */
352             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
353
354             fscal            = fvdw;
355
356              /* Update vectorial force */
357             fix0             = _mm_macc_ps(dx00,fscal,fix0);
358             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
359             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
360
361             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
362             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
363             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
364
365             /**************************
366              * CALCULATE INTERACTIONS *
367              **************************/
368
369             r11              = _mm_mul_ps(rsq11,rinv11);
370
371             /* Calculate table index by multiplying r with table scale and truncate to integer */
372             rt               = _mm_mul_ps(r11,vftabscale);
373             vfitab           = _mm_cvttps_epi32(rt);
374 #ifdef __XOP__
375             vfeps            = _mm_frcz_ps(rt);
376 #else
377             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
378 #endif
379             twovfeps         = _mm_add_ps(vfeps,vfeps);
380             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
381
382             /* CUBIC SPLINE TABLE ELECTROSTATICS */
383             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
384             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
385             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
386             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
387             _MM_TRANSPOSE4_PS(Y,F,G,H);
388             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
389             VV               = _mm_macc_ps(vfeps,Fp,Y);
390             velec            = _mm_mul_ps(qq11,VV);
391             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
392             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
393
394             /* Update potential sum for this i atom from the interaction with this j atom. */
395             velecsum         = _mm_add_ps(velecsum,velec);
396
397             fscal            = felec;
398
399              /* Update vectorial force */
400             fix1             = _mm_macc_ps(dx11,fscal,fix1);
401             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
402             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
403
404             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
405             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
406             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
407
408             /**************************
409              * CALCULATE INTERACTIONS *
410              **************************/
411
412             r12              = _mm_mul_ps(rsq12,rinv12);
413
414             /* Calculate table index by multiplying r with table scale and truncate to integer */
415             rt               = _mm_mul_ps(r12,vftabscale);
416             vfitab           = _mm_cvttps_epi32(rt);
417 #ifdef __XOP__
418             vfeps            = _mm_frcz_ps(rt);
419 #else
420             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
421 #endif
422             twovfeps         = _mm_add_ps(vfeps,vfeps);
423             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
424
425             /* CUBIC SPLINE TABLE ELECTROSTATICS */
426             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
427             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
428             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
429             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
430             _MM_TRANSPOSE4_PS(Y,F,G,H);
431             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
432             VV               = _mm_macc_ps(vfeps,Fp,Y);
433             velec            = _mm_mul_ps(qq12,VV);
434             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
435             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
436
437             /* Update potential sum for this i atom from the interaction with this j atom. */
438             velecsum         = _mm_add_ps(velecsum,velec);
439
440             fscal            = felec;
441
442              /* Update vectorial force */
443             fix1             = _mm_macc_ps(dx12,fscal,fix1);
444             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
445             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
446
447             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
448             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
449             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
450
451             /**************************
452              * CALCULATE INTERACTIONS *
453              **************************/
454
455             r13              = _mm_mul_ps(rsq13,rinv13);
456
457             /* Calculate table index by multiplying r with table scale and truncate to integer */
458             rt               = _mm_mul_ps(r13,vftabscale);
459             vfitab           = _mm_cvttps_epi32(rt);
460 #ifdef __XOP__
461             vfeps            = _mm_frcz_ps(rt);
462 #else
463             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
464 #endif
465             twovfeps         = _mm_add_ps(vfeps,vfeps);
466             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
467
468             /* CUBIC SPLINE TABLE ELECTROSTATICS */
469             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
470             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
471             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
472             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
473             _MM_TRANSPOSE4_PS(Y,F,G,H);
474             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
475             VV               = _mm_macc_ps(vfeps,Fp,Y);
476             velec            = _mm_mul_ps(qq13,VV);
477             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
478             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
479
480             /* Update potential sum for this i atom from the interaction with this j atom. */
481             velecsum         = _mm_add_ps(velecsum,velec);
482
483             fscal            = felec;
484
485              /* Update vectorial force */
486             fix1             = _mm_macc_ps(dx13,fscal,fix1);
487             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
488             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
489
490             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
491             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
492             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
493
494             /**************************
495              * CALCULATE INTERACTIONS *
496              **************************/
497
498             r21              = _mm_mul_ps(rsq21,rinv21);
499
500             /* Calculate table index by multiplying r with table scale and truncate to integer */
501             rt               = _mm_mul_ps(r21,vftabscale);
502             vfitab           = _mm_cvttps_epi32(rt);
503 #ifdef __XOP__
504             vfeps            = _mm_frcz_ps(rt);
505 #else
506             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
507 #endif
508             twovfeps         = _mm_add_ps(vfeps,vfeps);
509             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
510
511             /* CUBIC SPLINE TABLE ELECTROSTATICS */
512             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
513             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
514             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
515             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
516             _MM_TRANSPOSE4_PS(Y,F,G,H);
517             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
518             VV               = _mm_macc_ps(vfeps,Fp,Y);
519             velec            = _mm_mul_ps(qq21,VV);
520             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
521             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
522
523             /* Update potential sum for this i atom from the interaction with this j atom. */
524             velecsum         = _mm_add_ps(velecsum,velec);
525
526             fscal            = felec;
527
528              /* Update vectorial force */
529             fix2             = _mm_macc_ps(dx21,fscal,fix2);
530             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
531             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
532
533             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
534             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
535             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
536
537             /**************************
538              * CALCULATE INTERACTIONS *
539              **************************/
540
541             r22              = _mm_mul_ps(rsq22,rinv22);
542
543             /* Calculate table index by multiplying r with table scale and truncate to integer */
544             rt               = _mm_mul_ps(r22,vftabscale);
545             vfitab           = _mm_cvttps_epi32(rt);
546 #ifdef __XOP__
547             vfeps            = _mm_frcz_ps(rt);
548 #else
549             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
550 #endif
551             twovfeps         = _mm_add_ps(vfeps,vfeps);
552             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
553
554             /* CUBIC SPLINE TABLE ELECTROSTATICS */
555             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
556             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
557             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
558             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
559             _MM_TRANSPOSE4_PS(Y,F,G,H);
560             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
561             VV               = _mm_macc_ps(vfeps,Fp,Y);
562             velec            = _mm_mul_ps(qq22,VV);
563             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
564             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velecsum         = _mm_add_ps(velecsum,velec);
568
569             fscal            = felec;
570
571              /* Update vectorial force */
572             fix2             = _mm_macc_ps(dx22,fscal,fix2);
573             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
574             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
575
576             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
577             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
578             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
579
580             /**************************
581              * CALCULATE INTERACTIONS *
582              **************************/
583
584             r23              = _mm_mul_ps(rsq23,rinv23);
585
586             /* Calculate table index by multiplying r with table scale and truncate to integer */
587             rt               = _mm_mul_ps(r23,vftabscale);
588             vfitab           = _mm_cvttps_epi32(rt);
589 #ifdef __XOP__
590             vfeps            = _mm_frcz_ps(rt);
591 #else
592             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
593 #endif
594             twovfeps         = _mm_add_ps(vfeps,vfeps);
595             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
596
597             /* CUBIC SPLINE TABLE ELECTROSTATICS */
598             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
599             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
600             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
601             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
602             _MM_TRANSPOSE4_PS(Y,F,G,H);
603             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
604             VV               = _mm_macc_ps(vfeps,Fp,Y);
605             velec            = _mm_mul_ps(qq23,VV);
606             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
607             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
608
609             /* Update potential sum for this i atom from the interaction with this j atom. */
610             velecsum         = _mm_add_ps(velecsum,velec);
611
612             fscal            = felec;
613
614              /* Update vectorial force */
615             fix2             = _mm_macc_ps(dx23,fscal,fix2);
616             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
617             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
618
619             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
620             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
621             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
622
623             /**************************
624              * CALCULATE INTERACTIONS *
625              **************************/
626
627             r31              = _mm_mul_ps(rsq31,rinv31);
628
629             /* Calculate table index by multiplying r with table scale and truncate to integer */
630             rt               = _mm_mul_ps(r31,vftabscale);
631             vfitab           = _mm_cvttps_epi32(rt);
632 #ifdef __XOP__
633             vfeps            = _mm_frcz_ps(rt);
634 #else
635             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
636 #endif
637             twovfeps         = _mm_add_ps(vfeps,vfeps);
638             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
639
640             /* CUBIC SPLINE TABLE ELECTROSTATICS */
641             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
642             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
643             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
644             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
645             _MM_TRANSPOSE4_PS(Y,F,G,H);
646             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
647             VV               = _mm_macc_ps(vfeps,Fp,Y);
648             velec            = _mm_mul_ps(qq31,VV);
649             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
650             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
651
652             /* Update potential sum for this i atom from the interaction with this j atom. */
653             velecsum         = _mm_add_ps(velecsum,velec);
654
655             fscal            = felec;
656
657              /* Update vectorial force */
658             fix3             = _mm_macc_ps(dx31,fscal,fix3);
659             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
660             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
661
662             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
663             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
664             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
665
666             /**************************
667              * CALCULATE INTERACTIONS *
668              **************************/
669
670             r32              = _mm_mul_ps(rsq32,rinv32);
671
672             /* Calculate table index by multiplying r with table scale and truncate to integer */
673             rt               = _mm_mul_ps(r32,vftabscale);
674             vfitab           = _mm_cvttps_epi32(rt);
675 #ifdef __XOP__
676             vfeps            = _mm_frcz_ps(rt);
677 #else
678             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
679 #endif
680             twovfeps         = _mm_add_ps(vfeps,vfeps);
681             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
682
683             /* CUBIC SPLINE TABLE ELECTROSTATICS */
684             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
685             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
686             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
687             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
688             _MM_TRANSPOSE4_PS(Y,F,G,H);
689             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
690             VV               = _mm_macc_ps(vfeps,Fp,Y);
691             velec            = _mm_mul_ps(qq32,VV);
692             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
693             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
694
695             /* Update potential sum for this i atom from the interaction with this j atom. */
696             velecsum         = _mm_add_ps(velecsum,velec);
697
698             fscal            = felec;
699
700              /* Update vectorial force */
701             fix3             = _mm_macc_ps(dx32,fscal,fix3);
702             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
703             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
704
705             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
706             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
707             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
708
709             /**************************
710              * CALCULATE INTERACTIONS *
711              **************************/
712
713             r33              = _mm_mul_ps(rsq33,rinv33);
714
715             /* Calculate table index by multiplying r with table scale and truncate to integer */
716             rt               = _mm_mul_ps(r33,vftabscale);
717             vfitab           = _mm_cvttps_epi32(rt);
718 #ifdef __XOP__
719             vfeps            = _mm_frcz_ps(rt);
720 #else
721             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
722 #endif
723             twovfeps         = _mm_add_ps(vfeps,vfeps);
724             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
725
726             /* CUBIC SPLINE TABLE ELECTROSTATICS */
727             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
728             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
729             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
730             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
731             _MM_TRANSPOSE4_PS(Y,F,G,H);
732             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
733             VV               = _mm_macc_ps(vfeps,Fp,Y);
734             velec            = _mm_mul_ps(qq33,VV);
735             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
736             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
737
738             /* Update potential sum for this i atom from the interaction with this j atom. */
739             velecsum         = _mm_add_ps(velecsum,velec);
740
741             fscal            = felec;
742
743              /* Update vectorial force */
744             fix3             = _mm_macc_ps(dx33,fscal,fix3);
745             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
746             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
747
748             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
749             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
750             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
751
752             fjptrA             = f+j_coord_offsetA;
753             fjptrB             = f+j_coord_offsetB;
754             fjptrC             = f+j_coord_offsetC;
755             fjptrD             = f+j_coord_offsetD;
756
757             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
758                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
759                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
760
761             /* Inner loop uses 476 flops */
762         }
763
764         if(jidx<j_index_end)
765         {
766
767             /* Get j neighbor index, and coordinate index */
768             jnrlistA         = jjnr[jidx];
769             jnrlistB         = jjnr[jidx+1];
770             jnrlistC         = jjnr[jidx+2];
771             jnrlistD         = jjnr[jidx+3];
772             /* Sign of each element will be negative for non-real atoms.
773              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
774              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
775              */
776             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
777             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
778             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
779             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
780             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
781             j_coord_offsetA  = DIM*jnrA;
782             j_coord_offsetB  = DIM*jnrB;
783             j_coord_offsetC  = DIM*jnrC;
784             j_coord_offsetD  = DIM*jnrD;
785
786             /* load j atom coordinates */
787             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
788                                               x+j_coord_offsetC,x+j_coord_offsetD,
789                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
790                                               &jy2,&jz2,&jx3,&jy3,&jz3);
791
792             /* Calculate displacement vector */
793             dx00             = _mm_sub_ps(ix0,jx0);
794             dy00             = _mm_sub_ps(iy0,jy0);
795             dz00             = _mm_sub_ps(iz0,jz0);
796             dx11             = _mm_sub_ps(ix1,jx1);
797             dy11             = _mm_sub_ps(iy1,jy1);
798             dz11             = _mm_sub_ps(iz1,jz1);
799             dx12             = _mm_sub_ps(ix1,jx2);
800             dy12             = _mm_sub_ps(iy1,jy2);
801             dz12             = _mm_sub_ps(iz1,jz2);
802             dx13             = _mm_sub_ps(ix1,jx3);
803             dy13             = _mm_sub_ps(iy1,jy3);
804             dz13             = _mm_sub_ps(iz1,jz3);
805             dx21             = _mm_sub_ps(ix2,jx1);
806             dy21             = _mm_sub_ps(iy2,jy1);
807             dz21             = _mm_sub_ps(iz2,jz1);
808             dx22             = _mm_sub_ps(ix2,jx2);
809             dy22             = _mm_sub_ps(iy2,jy2);
810             dz22             = _mm_sub_ps(iz2,jz2);
811             dx23             = _mm_sub_ps(ix2,jx3);
812             dy23             = _mm_sub_ps(iy2,jy3);
813             dz23             = _mm_sub_ps(iz2,jz3);
814             dx31             = _mm_sub_ps(ix3,jx1);
815             dy31             = _mm_sub_ps(iy3,jy1);
816             dz31             = _mm_sub_ps(iz3,jz1);
817             dx32             = _mm_sub_ps(ix3,jx2);
818             dy32             = _mm_sub_ps(iy3,jy2);
819             dz32             = _mm_sub_ps(iz3,jz2);
820             dx33             = _mm_sub_ps(ix3,jx3);
821             dy33             = _mm_sub_ps(iy3,jy3);
822             dz33             = _mm_sub_ps(iz3,jz3);
823
824             /* Calculate squared distance and things based on it */
825             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
826             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
827             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
828             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
829             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
830             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
831             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
832             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
833             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
834             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
835
836             rinv00           = gmx_mm_invsqrt_ps(rsq00);
837             rinv11           = gmx_mm_invsqrt_ps(rsq11);
838             rinv12           = gmx_mm_invsqrt_ps(rsq12);
839             rinv13           = gmx_mm_invsqrt_ps(rsq13);
840             rinv21           = gmx_mm_invsqrt_ps(rsq21);
841             rinv22           = gmx_mm_invsqrt_ps(rsq22);
842             rinv23           = gmx_mm_invsqrt_ps(rsq23);
843             rinv31           = gmx_mm_invsqrt_ps(rsq31);
844             rinv32           = gmx_mm_invsqrt_ps(rsq32);
845             rinv33           = gmx_mm_invsqrt_ps(rsq33);
846
847             fjx0             = _mm_setzero_ps();
848             fjy0             = _mm_setzero_ps();
849             fjz0             = _mm_setzero_ps();
850             fjx1             = _mm_setzero_ps();
851             fjy1             = _mm_setzero_ps();
852             fjz1             = _mm_setzero_ps();
853             fjx2             = _mm_setzero_ps();
854             fjy2             = _mm_setzero_ps();
855             fjz2             = _mm_setzero_ps();
856             fjx3             = _mm_setzero_ps();
857             fjy3             = _mm_setzero_ps();
858             fjz3             = _mm_setzero_ps();
859
860             /**************************
861              * CALCULATE INTERACTIONS *
862              **************************/
863
864             r00              = _mm_mul_ps(rsq00,rinv00);
865             r00              = _mm_andnot_ps(dummy_mask,r00);
866
867             /* Calculate table index by multiplying r with table scale and truncate to integer */
868             rt               = _mm_mul_ps(r00,vftabscale);
869             vfitab           = _mm_cvttps_epi32(rt);
870 #ifdef __XOP__
871             vfeps            = _mm_frcz_ps(rt);
872 #else
873             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
874 #endif
875             twovfeps         = _mm_add_ps(vfeps,vfeps);
876             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
877
878             /* CUBIC SPLINE TABLE DISPERSION */
879             vfitab           = _mm_add_epi32(vfitab,ifour);
880             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
881             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
882             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
883             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
884             _MM_TRANSPOSE4_PS(Y,F,G,H);
885             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
886             VV               = _mm_macc_ps(vfeps,Fp,Y);
887             vvdw6            = _mm_mul_ps(c6_00,VV);
888             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
889             fvdw6            = _mm_mul_ps(c6_00,FF);
890
891             /* CUBIC SPLINE TABLE REPULSION */
892             vfitab           = _mm_add_epi32(vfitab,ifour);
893             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
894             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
895             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
896             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
897             _MM_TRANSPOSE4_PS(Y,F,G,H);
898             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
899             VV               = _mm_macc_ps(vfeps,Fp,Y);
900             vvdw12           = _mm_mul_ps(c12_00,VV);
901             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
902             fvdw12           = _mm_mul_ps(c12_00,FF);
903             vvdw             = _mm_add_ps(vvdw12,vvdw6);
904             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
905
906             /* Update potential sum for this i atom from the interaction with this j atom. */
907             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
908             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
909
910             fscal            = fvdw;
911
912             fscal            = _mm_andnot_ps(dummy_mask,fscal);
913
914              /* Update vectorial force */
915             fix0             = _mm_macc_ps(dx00,fscal,fix0);
916             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
917             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
918
919             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
920             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
921             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
922
923             /**************************
924              * CALCULATE INTERACTIONS *
925              **************************/
926
927             r11              = _mm_mul_ps(rsq11,rinv11);
928             r11              = _mm_andnot_ps(dummy_mask,r11);
929
930             /* Calculate table index by multiplying r with table scale and truncate to integer */
931             rt               = _mm_mul_ps(r11,vftabscale);
932             vfitab           = _mm_cvttps_epi32(rt);
933 #ifdef __XOP__
934             vfeps            = _mm_frcz_ps(rt);
935 #else
936             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
937 #endif
938             twovfeps         = _mm_add_ps(vfeps,vfeps);
939             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
940
941             /* CUBIC SPLINE TABLE ELECTROSTATICS */
942             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
943             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
944             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
945             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
946             _MM_TRANSPOSE4_PS(Y,F,G,H);
947             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
948             VV               = _mm_macc_ps(vfeps,Fp,Y);
949             velec            = _mm_mul_ps(qq11,VV);
950             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
951             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
952
953             /* Update potential sum for this i atom from the interaction with this j atom. */
954             velec            = _mm_andnot_ps(dummy_mask,velec);
955             velecsum         = _mm_add_ps(velecsum,velec);
956
957             fscal            = felec;
958
959             fscal            = _mm_andnot_ps(dummy_mask,fscal);
960
961              /* Update vectorial force */
962             fix1             = _mm_macc_ps(dx11,fscal,fix1);
963             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
964             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
965
966             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
967             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
968             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
969
970             /**************************
971              * CALCULATE INTERACTIONS *
972              **************************/
973
974             r12              = _mm_mul_ps(rsq12,rinv12);
975             r12              = _mm_andnot_ps(dummy_mask,r12);
976
977             /* Calculate table index by multiplying r with table scale and truncate to integer */
978             rt               = _mm_mul_ps(r12,vftabscale);
979             vfitab           = _mm_cvttps_epi32(rt);
980 #ifdef __XOP__
981             vfeps            = _mm_frcz_ps(rt);
982 #else
983             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
984 #endif
985             twovfeps         = _mm_add_ps(vfeps,vfeps);
986             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
987
988             /* CUBIC SPLINE TABLE ELECTROSTATICS */
989             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
990             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
991             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
992             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
993             _MM_TRANSPOSE4_PS(Y,F,G,H);
994             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
995             VV               = _mm_macc_ps(vfeps,Fp,Y);
996             velec            = _mm_mul_ps(qq12,VV);
997             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
998             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
999
1000             /* Update potential sum for this i atom from the interaction with this j atom. */
1001             velec            = _mm_andnot_ps(dummy_mask,velec);
1002             velecsum         = _mm_add_ps(velecsum,velec);
1003
1004             fscal            = felec;
1005
1006             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1007
1008              /* Update vectorial force */
1009             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1010             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1011             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1012
1013             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1014             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1015             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1016
1017             /**************************
1018              * CALCULATE INTERACTIONS *
1019              **************************/
1020
1021             r13              = _mm_mul_ps(rsq13,rinv13);
1022             r13              = _mm_andnot_ps(dummy_mask,r13);
1023
1024             /* Calculate table index by multiplying r with table scale and truncate to integer */
1025             rt               = _mm_mul_ps(r13,vftabscale);
1026             vfitab           = _mm_cvttps_epi32(rt);
1027 #ifdef __XOP__
1028             vfeps            = _mm_frcz_ps(rt);
1029 #else
1030             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1031 #endif
1032             twovfeps         = _mm_add_ps(vfeps,vfeps);
1033             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1034
1035             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1036             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1037             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1038             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1039             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1040             _MM_TRANSPOSE4_PS(Y,F,G,H);
1041             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1042             VV               = _mm_macc_ps(vfeps,Fp,Y);
1043             velec            = _mm_mul_ps(qq13,VV);
1044             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1045             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1046
1047             /* Update potential sum for this i atom from the interaction with this j atom. */
1048             velec            = _mm_andnot_ps(dummy_mask,velec);
1049             velecsum         = _mm_add_ps(velecsum,velec);
1050
1051             fscal            = felec;
1052
1053             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1054
1055              /* Update vectorial force */
1056             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1057             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1058             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1059
1060             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1061             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1062             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1063
1064             /**************************
1065              * CALCULATE INTERACTIONS *
1066              **************************/
1067
1068             r21              = _mm_mul_ps(rsq21,rinv21);
1069             r21              = _mm_andnot_ps(dummy_mask,r21);
1070
1071             /* Calculate table index by multiplying r with table scale and truncate to integer */
1072             rt               = _mm_mul_ps(r21,vftabscale);
1073             vfitab           = _mm_cvttps_epi32(rt);
1074 #ifdef __XOP__
1075             vfeps            = _mm_frcz_ps(rt);
1076 #else
1077             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1078 #endif
1079             twovfeps         = _mm_add_ps(vfeps,vfeps);
1080             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1081
1082             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1083             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1084             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1085             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1086             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1087             _MM_TRANSPOSE4_PS(Y,F,G,H);
1088             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1089             VV               = _mm_macc_ps(vfeps,Fp,Y);
1090             velec            = _mm_mul_ps(qq21,VV);
1091             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1092             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1093
1094             /* Update potential sum for this i atom from the interaction with this j atom. */
1095             velec            = _mm_andnot_ps(dummy_mask,velec);
1096             velecsum         = _mm_add_ps(velecsum,velec);
1097
1098             fscal            = felec;
1099
1100             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1101
1102              /* Update vectorial force */
1103             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1104             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1105             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1106
1107             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1108             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1109             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1110
1111             /**************************
1112              * CALCULATE INTERACTIONS *
1113              **************************/
1114
1115             r22              = _mm_mul_ps(rsq22,rinv22);
1116             r22              = _mm_andnot_ps(dummy_mask,r22);
1117
1118             /* Calculate table index by multiplying r with table scale and truncate to integer */
1119             rt               = _mm_mul_ps(r22,vftabscale);
1120             vfitab           = _mm_cvttps_epi32(rt);
1121 #ifdef __XOP__
1122             vfeps            = _mm_frcz_ps(rt);
1123 #else
1124             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1125 #endif
1126             twovfeps         = _mm_add_ps(vfeps,vfeps);
1127             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1128
1129             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1130             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1131             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1132             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1133             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1134             _MM_TRANSPOSE4_PS(Y,F,G,H);
1135             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1136             VV               = _mm_macc_ps(vfeps,Fp,Y);
1137             velec            = _mm_mul_ps(qq22,VV);
1138             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1139             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1140
1141             /* Update potential sum for this i atom from the interaction with this j atom. */
1142             velec            = _mm_andnot_ps(dummy_mask,velec);
1143             velecsum         = _mm_add_ps(velecsum,velec);
1144
1145             fscal            = felec;
1146
1147             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1148
1149              /* Update vectorial force */
1150             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1151             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1152             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1153
1154             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1155             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1156             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1157
1158             /**************************
1159              * CALCULATE INTERACTIONS *
1160              **************************/
1161
1162             r23              = _mm_mul_ps(rsq23,rinv23);
1163             r23              = _mm_andnot_ps(dummy_mask,r23);
1164
1165             /* Calculate table index by multiplying r with table scale and truncate to integer */
1166             rt               = _mm_mul_ps(r23,vftabscale);
1167             vfitab           = _mm_cvttps_epi32(rt);
1168 #ifdef __XOP__
1169             vfeps            = _mm_frcz_ps(rt);
1170 #else
1171             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1172 #endif
1173             twovfeps         = _mm_add_ps(vfeps,vfeps);
1174             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1175
1176             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1177             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1178             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1179             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1180             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1181             _MM_TRANSPOSE4_PS(Y,F,G,H);
1182             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1183             VV               = _mm_macc_ps(vfeps,Fp,Y);
1184             velec            = _mm_mul_ps(qq23,VV);
1185             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1186             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1187
1188             /* Update potential sum for this i atom from the interaction with this j atom. */
1189             velec            = _mm_andnot_ps(dummy_mask,velec);
1190             velecsum         = _mm_add_ps(velecsum,velec);
1191
1192             fscal            = felec;
1193
1194             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1195
1196              /* Update vectorial force */
1197             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1198             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1199             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1200
1201             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1202             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1203             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1204
1205             /**************************
1206              * CALCULATE INTERACTIONS *
1207              **************************/
1208
1209             r31              = _mm_mul_ps(rsq31,rinv31);
1210             r31              = _mm_andnot_ps(dummy_mask,r31);
1211
1212             /* Calculate table index by multiplying r with table scale and truncate to integer */
1213             rt               = _mm_mul_ps(r31,vftabscale);
1214             vfitab           = _mm_cvttps_epi32(rt);
1215 #ifdef __XOP__
1216             vfeps            = _mm_frcz_ps(rt);
1217 #else
1218             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1219 #endif
1220             twovfeps         = _mm_add_ps(vfeps,vfeps);
1221             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1222
1223             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1224             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1225             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1226             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1227             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1228             _MM_TRANSPOSE4_PS(Y,F,G,H);
1229             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1230             VV               = _mm_macc_ps(vfeps,Fp,Y);
1231             velec            = _mm_mul_ps(qq31,VV);
1232             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1233             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1234
1235             /* Update potential sum for this i atom from the interaction with this j atom. */
1236             velec            = _mm_andnot_ps(dummy_mask,velec);
1237             velecsum         = _mm_add_ps(velecsum,velec);
1238
1239             fscal            = felec;
1240
1241             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1242
1243              /* Update vectorial force */
1244             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1245             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1246             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1247
1248             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1249             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1250             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1251
1252             /**************************
1253              * CALCULATE INTERACTIONS *
1254              **************************/
1255
1256             r32              = _mm_mul_ps(rsq32,rinv32);
1257             r32              = _mm_andnot_ps(dummy_mask,r32);
1258
1259             /* Calculate table index by multiplying r with table scale and truncate to integer */
1260             rt               = _mm_mul_ps(r32,vftabscale);
1261             vfitab           = _mm_cvttps_epi32(rt);
1262 #ifdef __XOP__
1263             vfeps            = _mm_frcz_ps(rt);
1264 #else
1265             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1266 #endif
1267             twovfeps         = _mm_add_ps(vfeps,vfeps);
1268             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1269
1270             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1271             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1272             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1273             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1274             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1275             _MM_TRANSPOSE4_PS(Y,F,G,H);
1276             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1277             VV               = _mm_macc_ps(vfeps,Fp,Y);
1278             velec            = _mm_mul_ps(qq32,VV);
1279             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1280             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1281
1282             /* Update potential sum for this i atom from the interaction with this j atom. */
1283             velec            = _mm_andnot_ps(dummy_mask,velec);
1284             velecsum         = _mm_add_ps(velecsum,velec);
1285
1286             fscal            = felec;
1287
1288             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1289
1290              /* Update vectorial force */
1291             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1292             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1293             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1294
1295             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1296             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1297             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1298
1299             /**************************
1300              * CALCULATE INTERACTIONS *
1301              **************************/
1302
1303             r33              = _mm_mul_ps(rsq33,rinv33);
1304             r33              = _mm_andnot_ps(dummy_mask,r33);
1305
1306             /* Calculate table index by multiplying r with table scale and truncate to integer */
1307             rt               = _mm_mul_ps(r33,vftabscale);
1308             vfitab           = _mm_cvttps_epi32(rt);
1309 #ifdef __XOP__
1310             vfeps            = _mm_frcz_ps(rt);
1311 #else
1312             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1313 #endif
1314             twovfeps         = _mm_add_ps(vfeps,vfeps);
1315             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1316
1317             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1318             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1319             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1320             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1321             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1322             _MM_TRANSPOSE4_PS(Y,F,G,H);
1323             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1324             VV               = _mm_macc_ps(vfeps,Fp,Y);
1325             velec            = _mm_mul_ps(qq33,VV);
1326             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1327             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1328
1329             /* Update potential sum for this i atom from the interaction with this j atom. */
1330             velec            = _mm_andnot_ps(dummy_mask,velec);
1331             velecsum         = _mm_add_ps(velecsum,velec);
1332
1333             fscal            = felec;
1334
1335             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1336
1337              /* Update vectorial force */
1338             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1339             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1340             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1341
1342             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1343             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1344             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1345
1346             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1347             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1348             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1349             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1350
1351             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1352                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1353                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1354
1355             /* Inner loop uses 486 flops */
1356         }
1357
1358         /* End of innermost loop */
1359
1360         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1361                                               f+i_coord_offset,fshift+i_shift_offset);
1362
1363         ggid                        = gid[iidx];
1364         /* Update potential energies */
1365         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1366         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1367
1368         /* Increment number of inner iterations */
1369         inneriter                  += j_index_end - j_index_start;
1370
1371         /* Outer loop uses 26 flops */
1372     }
1373
1374     /* Increment number of outer iterations */
1375     outeriter        += nri;
1376
1377     /* Update outer/inner flops */
1378
1379     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*486);
1380 }
1381 /*
1382  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1383  * Electrostatics interaction: CubicSplineTable
1384  * VdW interaction:            CubicSplineTable
1385  * Geometry:                   Water4-Water4
1386  * Calculate force/pot:        Force
1387  */
1388 void
1389 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1390                     (t_nblist                    * gmx_restrict       nlist,
1391                      rvec                        * gmx_restrict          xx,
1392                      rvec                        * gmx_restrict          ff,
1393                      t_forcerec                  * gmx_restrict          fr,
1394                      t_mdatoms                   * gmx_restrict     mdatoms,
1395                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1396                      t_nrnb                      * gmx_restrict        nrnb)
1397 {
1398     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1399      * just 0 for non-waters.
1400      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1401      * jnr indices corresponding to data put in the four positions in the SIMD register.
1402      */
1403     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1404     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1405     int              jnrA,jnrB,jnrC,jnrD;
1406     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1407     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1408     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1409     real             rcutoff_scalar;
1410     real             *shiftvec,*fshift,*x,*f;
1411     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1412     real             scratch[4*DIM];
1413     __m128           fscal,rcutoff,rcutoff2,jidxall;
1414     int              vdwioffset0;
1415     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1416     int              vdwioffset1;
1417     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1418     int              vdwioffset2;
1419     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1420     int              vdwioffset3;
1421     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1422     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1423     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1424     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1425     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1426     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1427     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1428     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1429     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1430     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1431     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1432     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1433     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1434     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1435     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1436     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1437     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1438     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1439     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1440     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1441     real             *charge;
1442     int              nvdwtype;
1443     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1444     int              *vdwtype;
1445     real             *vdwparam;
1446     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1447     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1448     __m128i          vfitab;
1449     __m128i          ifour       = _mm_set1_epi32(4);
1450     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1451     real             *vftab;
1452     __m128           dummy_mask,cutoff_mask;
1453     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1454     __m128           one     = _mm_set1_ps(1.0);
1455     __m128           two     = _mm_set1_ps(2.0);
1456     x                = xx[0];
1457     f                = ff[0];
1458
1459     nri              = nlist->nri;
1460     iinr             = nlist->iinr;
1461     jindex           = nlist->jindex;
1462     jjnr             = nlist->jjnr;
1463     shiftidx         = nlist->shift;
1464     gid              = nlist->gid;
1465     shiftvec         = fr->shift_vec[0];
1466     fshift           = fr->fshift[0];
1467     facel            = _mm_set1_ps(fr->epsfac);
1468     charge           = mdatoms->chargeA;
1469     nvdwtype         = fr->ntype;
1470     vdwparam         = fr->nbfp;
1471     vdwtype          = mdatoms->typeA;
1472
1473     vftab            = kernel_data->table_elec_vdw->data;
1474     vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1475
1476     /* Setup water-specific parameters */
1477     inr              = nlist->iinr[0];
1478     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1479     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1480     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1481     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1482
1483     jq1              = _mm_set1_ps(charge[inr+1]);
1484     jq2              = _mm_set1_ps(charge[inr+2]);
1485     jq3              = _mm_set1_ps(charge[inr+3]);
1486     vdwjidx0A        = 2*vdwtype[inr+0];
1487     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1488     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1489     qq11             = _mm_mul_ps(iq1,jq1);
1490     qq12             = _mm_mul_ps(iq1,jq2);
1491     qq13             = _mm_mul_ps(iq1,jq3);
1492     qq21             = _mm_mul_ps(iq2,jq1);
1493     qq22             = _mm_mul_ps(iq2,jq2);
1494     qq23             = _mm_mul_ps(iq2,jq3);
1495     qq31             = _mm_mul_ps(iq3,jq1);
1496     qq32             = _mm_mul_ps(iq3,jq2);
1497     qq33             = _mm_mul_ps(iq3,jq3);
1498
1499     /* Avoid stupid compiler warnings */
1500     jnrA = jnrB = jnrC = jnrD = 0;
1501     j_coord_offsetA = 0;
1502     j_coord_offsetB = 0;
1503     j_coord_offsetC = 0;
1504     j_coord_offsetD = 0;
1505
1506     outeriter        = 0;
1507     inneriter        = 0;
1508
1509     for(iidx=0;iidx<4*DIM;iidx++)
1510     {
1511         scratch[iidx] = 0.0;
1512     }
1513
1514     /* Start outer loop over neighborlists */
1515     for(iidx=0; iidx<nri; iidx++)
1516     {
1517         /* Load shift vector for this list */
1518         i_shift_offset   = DIM*shiftidx[iidx];
1519
1520         /* Load limits for loop over neighbors */
1521         j_index_start    = jindex[iidx];
1522         j_index_end      = jindex[iidx+1];
1523
1524         /* Get outer coordinate index */
1525         inr              = iinr[iidx];
1526         i_coord_offset   = DIM*inr;
1527
1528         /* Load i particle coords and add shift vector */
1529         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1530                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1531
1532         fix0             = _mm_setzero_ps();
1533         fiy0             = _mm_setzero_ps();
1534         fiz0             = _mm_setzero_ps();
1535         fix1             = _mm_setzero_ps();
1536         fiy1             = _mm_setzero_ps();
1537         fiz1             = _mm_setzero_ps();
1538         fix2             = _mm_setzero_ps();
1539         fiy2             = _mm_setzero_ps();
1540         fiz2             = _mm_setzero_ps();
1541         fix3             = _mm_setzero_ps();
1542         fiy3             = _mm_setzero_ps();
1543         fiz3             = _mm_setzero_ps();
1544
1545         /* Start inner kernel loop */
1546         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1547         {
1548
1549             /* Get j neighbor index, and coordinate index */
1550             jnrA             = jjnr[jidx];
1551             jnrB             = jjnr[jidx+1];
1552             jnrC             = jjnr[jidx+2];
1553             jnrD             = jjnr[jidx+3];
1554             j_coord_offsetA  = DIM*jnrA;
1555             j_coord_offsetB  = DIM*jnrB;
1556             j_coord_offsetC  = DIM*jnrC;
1557             j_coord_offsetD  = DIM*jnrD;
1558
1559             /* load j atom coordinates */
1560             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1561                                               x+j_coord_offsetC,x+j_coord_offsetD,
1562                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1563                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1564
1565             /* Calculate displacement vector */
1566             dx00             = _mm_sub_ps(ix0,jx0);
1567             dy00             = _mm_sub_ps(iy0,jy0);
1568             dz00             = _mm_sub_ps(iz0,jz0);
1569             dx11             = _mm_sub_ps(ix1,jx1);
1570             dy11             = _mm_sub_ps(iy1,jy1);
1571             dz11             = _mm_sub_ps(iz1,jz1);
1572             dx12             = _mm_sub_ps(ix1,jx2);
1573             dy12             = _mm_sub_ps(iy1,jy2);
1574             dz12             = _mm_sub_ps(iz1,jz2);
1575             dx13             = _mm_sub_ps(ix1,jx3);
1576             dy13             = _mm_sub_ps(iy1,jy3);
1577             dz13             = _mm_sub_ps(iz1,jz3);
1578             dx21             = _mm_sub_ps(ix2,jx1);
1579             dy21             = _mm_sub_ps(iy2,jy1);
1580             dz21             = _mm_sub_ps(iz2,jz1);
1581             dx22             = _mm_sub_ps(ix2,jx2);
1582             dy22             = _mm_sub_ps(iy2,jy2);
1583             dz22             = _mm_sub_ps(iz2,jz2);
1584             dx23             = _mm_sub_ps(ix2,jx3);
1585             dy23             = _mm_sub_ps(iy2,jy3);
1586             dz23             = _mm_sub_ps(iz2,jz3);
1587             dx31             = _mm_sub_ps(ix3,jx1);
1588             dy31             = _mm_sub_ps(iy3,jy1);
1589             dz31             = _mm_sub_ps(iz3,jz1);
1590             dx32             = _mm_sub_ps(ix3,jx2);
1591             dy32             = _mm_sub_ps(iy3,jy2);
1592             dz32             = _mm_sub_ps(iz3,jz2);
1593             dx33             = _mm_sub_ps(ix3,jx3);
1594             dy33             = _mm_sub_ps(iy3,jy3);
1595             dz33             = _mm_sub_ps(iz3,jz3);
1596
1597             /* Calculate squared distance and things based on it */
1598             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1599             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1600             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1601             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1602             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1603             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1604             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1605             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1606             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1607             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1608
1609             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1610             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1611             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1612             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1613             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1614             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1615             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1616             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1617             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1618             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1619
1620             fjx0             = _mm_setzero_ps();
1621             fjy0             = _mm_setzero_ps();
1622             fjz0             = _mm_setzero_ps();
1623             fjx1             = _mm_setzero_ps();
1624             fjy1             = _mm_setzero_ps();
1625             fjz1             = _mm_setzero_ps();
1626             fjx2             = _mm_setzero_ps();
1627             fjy2             = _mm_setzero_ps();
1628             fjz2             = _mm_setzero_ps();
1629             fjx3             = _mm_setzero_ps();
1630             fjy3             = _mm_setzero_ps();
1631             fjz3             = _mm_setzero_ps();
1632
1633             /**************************
1634              * CALCULATE INTERACTIONS *
1635              **************************/
1636
1637             r00              = _mm_mul_ps(rsq00,rinv00);
1638
1639             /* Calculate table index by multiplying r with table scale and truncate to integer */
1640             rt               = _mm_mul_ps(r00,vftabscale);
1641             vfitab           = _mm_cvttps_epi32(rt);
1642 #ifdef __XOP__
1643             vfeps            = _mm_frcz_ps(rt);
1644 #else
1645             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1646 #endif
1647             twovfeps         = _mm_add_ps(vfeps,vfeps);
1648             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1649
1650             /* CUBIC SPLINE TABLE DISPERSION */
1651             vfitab           = _mm_add_epi32(vfitab,ifour);
1652             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1653             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1654             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1655             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1656             _MM_TRANSPOSE4_PS(Y,F,G,H);
1657             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1658             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1659             fvdw6            = _mm_mul_ps(c6_00,FF);
1660
1661             /* CUBIC SPLINE TABLE REPULSION */
1662             vfitab           = _mm_add_epi32(vfitab,ifour);
1663             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1664             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1665             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1666             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1667             _MM_TRANSPOSE4_PS(Y,F,G,H);
1668             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1669             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1670             fvdw12           = _mm_mul_ps(c12_00,FF);
1671             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1672
1673             fscal            = fvdw;
1674
1675              /* Update vectorial force */
1676             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1677             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1678             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1679
1680             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1681             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1682             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1683
1684             /**************************
1685              * CALCULATE INTERACTIONS *
1686              **************************/
1687
1688             r11              = _mm_mul_ps(rsq11,rinv11);
1689
1690             /* Calculate table index by multiplying r with table scale and truncate to integer */
1691             rt               = _mm_mul_ps(r11,vftabscale);
1692             vfitab           = _mm_cvttps_epi32(rt);
1693 #ifdef __XOP__
1694             vfeps            = _mm_frcz_ps(rt);
1695 #else
1696             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1697 #endif
1698             twovfeps         = _mm_add_ps(vfeps,vfeps);
1699             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1700
1701             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1702             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1703             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1704             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1705             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1706             _MM_TRANSPOSE4_PS(Y,F,G,H);
1707             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1708             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1709             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1710
1711             fscal            = felec;
1712
1713              /* Update vectorial force */
1714             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1715             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1716             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1717
1718             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1719             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1720             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1721
1722             /**************************
1723              * CALCULATE INTERACTIONS *
1724              **************************/
1725
1726             r12              = _mm_mul_ps(rsq12,rinv12);
1727
1728             /* Calculate table index by multiplying r with table scale and truncate to integer */
1729             rt               = _mm_mul_ps(r12,vftabscale);
1730             vfitab           = _mm_cvttps_epi32(rt);
1731 #ifdef __XOP__
1732             vfeps            = _mm_frcz_ps(rt);
1733 #else
1734             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1735 #endif
1736             twovfeps         = _mm_add_ps(vfeps,vfeps);
1737             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1738
1739             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1740             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1741             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1742             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1743             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1744             _MM_TRANSPOSE4_PS(Y,F,G,H);
1745             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1746             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1747             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1748
1749             fscal            = felec;
1750
1751              /* Update vectorial force */
1752             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1753             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1754             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1755
1756             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1757             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1758             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1759
1760             /**************************
1761              * CALCULATE INTERACTIONS *
1762              **************************/
1763
1764             r13              = _mm_mul_ps(rsq13,rinv13);
1765
1766             /* Calculate table index by multiplying r with table scale and truncate to integer */
1767             rt               = _mm_mul_ps(r13,vftabscale);
1768             vfitab           = _mm_cvttps_epi32(rt);
1769 #ifdef __XOP__
1770             vfeps            = _mm_frcz_ps(rt);
1771 #else
1772             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1773 #endif
1774             twovfeps         = _mm_add_ps(vfeps,vfeps);
1775             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1776
1777             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1778             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1779             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1780             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1781             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1782             _MM_TRANSPOSE4_PS(Y,F,G,H);
1783             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1784             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1785             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1786
1787             fscal            = felec;
1788
1789              /* Update vectorial force */
1790             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1791             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1792             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1793
1794             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1795             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1796             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1797
1798             /**************************
1799              * CALCULATE INTERACTIONS *
1800              **************************/
1801
1802             r21              = _mm_mul_ps(rsq21,rinv21);
1803
1804             /* Calculate table index by multiplying r with table scale and truncate to integer */
1805             rt               = _mm_mul_ps(r21,vftabscale);
1806             vfitab           = _mm_cvttps_epi32(rt);
1807 #ifdef __XOP__
1808             vfeps            = _mm_frcz_ps(rt);
1809 #else
1810             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1811 #endif
1812             twovfeps         = _mm_add_ps(vfeps,vfeps);
1813             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1814
1815             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1816             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1817             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1818             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1819             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1820             _MM_TRANSPOSE4_PS(Y,F,G,H);
1821             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1822             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1823             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1824
1825             fscal            = felec;
1826
1827              /* Update vectorial force */
1828             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1829             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1830             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1831
1832             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1833             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1834             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1835
1836             /**************************
1837              * CALCULATE INTERACTIONS *
1838              **************************/
1839
1840             r22              = _mm_mul_ps(rsq22,rinv22);
1841
1842             /* Calculate table index by multiplying r with table scale and truncate to integer */
1843             rt               = _mm_mul_ps(r22,vftabscale);
1844             vfitab           = _mm_cvttps_epi32(rt);
1845 #ifdef __XOP__
1846             vfeps            = _mm_frcz_ps(rt);
1847 #else
1848             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1849 #endif
1850             twovfeps         = _mm_add_ps(vfeps,vfeps);
1851             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1852
1853             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1854             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1855             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1856             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1857             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1858             _MM_TRANSPOSE4_PS(Y,F,G,H);
1859             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1860             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1861             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1862
1863             fscal            = felec;
1864
1865              /* Update vectorial force */
1866             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1867             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1868             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1869
1870             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1871             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1872             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1873
1874             /**************************
1875              * CALCULATE INTERACTIONS *
1876              **************************/
1877
1878             r23              = _mm_mul_ps(rsq23,rinv23);
1879
1880             /* Calculate table index by multiplying r with table scale and truncate to integer */
1881             rt               = _mm_mul_ps(r23,vftabscale);
1882             vfitab           = _mm_cvttps_epi32(rt);
1883 #ifdef __XOP__
1884             vfeps            = _mm_frcz_ps(rt);
1885 #else
1886             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1887 #endif
1888             twovfeps         = _mm_add_ps(vfeps,vfeps);
1889             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1890
1891             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1892             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1893             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1894             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1895             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1896             _MM_TRANSPOSE4_PS(Y,F,G,H);
1897             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1898             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1899             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1900
1901             fscal            = felec;
1902
1903              /* Update vectorial force */
1904             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1905             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1906             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1907
1908             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1909             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1910             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1911
1912             /**************************
1913              * CALCULATE INTERACTIONS *
1914              **************************/
1915
1916             r31              = _mm_mul_ps(rsq31,rinv31);
1917
1918             /* Calculate table index by multiplying r with table scale and truncate to integer */
1919             rt               = _mm_mul_ps(r31,vftabscale);
1920             vfitab           = _mm_cvttps_epi32(rt);
1921 #ifdef __XOP__
1922             vfeps            = _mm_frcz_ps(rt);
1923 #else
1924             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1925 #endif
1926             twovfeps         = _mm_add_ps(vfeps,vfeps);
1927             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1928
1929             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1930             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1931             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1932             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1933             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1934             _MM_TRANSPOSE4_PS(Y,F,G,H);
1935             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1936             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1937             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1938
1939             fscal            = felec;
1940
1941              /* Update vectorial force */
1942             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1943             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1944             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1945
1946             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1947             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1948             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1949
1950             /**************************
1951              * CALCULATE INTERACTIONS *
1952              **************************/
1953
1954             r32              = _mm_mul_ps(rsq32,rinv32);
1955
1956             /* Calculate table index by multiplying r with table scale and truncate to integer */
1957             rt               = _mm_mul_ps(r32,vftabscale);
1958             vfitab           = _mm_cvttps_epi32(rt);
1959 #ifdef __XOP__
1960             vfeps            = _mm_frcz_ps(rt);
1961 #else
1962             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1963 #endif
1964             twovfeps         = _mm_add_ps(vfeps,vfeps);
1965             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1966
1967             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1968             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1969             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1970             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1971             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1972             _MM_TRANSPOSE4_PS(Y,F,G,H);
1973             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1974             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1975             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1976
1977             fscal            = felec;
1978
1979              /* Update vectorial force */
1980             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1981             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1982             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1983
1984             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1985             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1986             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1987
1988             /**************************
1989              * CALCULATE INTERACTIONS *
1990              **************************/
1991
1992             r33              = _mm_mul_ps(rsq33,rinv33);
1993
1994             /* Calculate table index by multiplying r with table scale and truncate to integer */
1995             rt               = _mm_mul_ps(r33,vftabscale);
1996             vfitab           = _mm_cvttps_epi32(rt);
1997 #ifdef __XOP__
1998             vfeps            = _mm_frcz_ps(rt);
1999 #else
2000             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2001 #endif
2002             twovfeps         = _mm_add_ps(vfeps,vfeps);
2003             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2004
2005             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2006             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2007             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2008             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2009             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2010             _MM_TRANSPOSE4_PS(Y,F,G,H);
2011             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2012             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2013             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2014
2015             fscal            = felec;
2016
2017              /* Update vectorial force */
2018             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2019             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2020             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2021
2022             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2023             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2024             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2025
2026             fjptrA             = f+j_coord_offsetA;
2027             fjptrB             = f+j_coord_offsetB;
2028             fjptrC             = f+j_coord_offsetC;
2029             fjptrD             = f+j_coord_offsetD;
2030
2031             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2032                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2033                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2034
2035             /* Inner loop uses 432 flops */
2036         }
2037
2038         if(jidx<j_index_end)
2039         {
2040
2041             /* Get j neighbor index, and coordinate index */
2042             jnrlistA         = jjnr[jidx];
2043             jnrlistB         = jjnr[jidx+1];
2044             jnrlistC         = jjnr[jidx+2];
2045             jnrlistD         = jjnr[jidx+3];
2046             /* Sign of each element will be negative for non-real atoms.
2047              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2048              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2049              */
2050             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2051             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2052             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2053             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2054             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2055             j_coord_offsetA  = DIM*jnrA;
2056             j_coord_offsetB  = DIM*jnrB;
2057             j_coord_offsetC  = DIM*jnrC;
2058             j_coord_offsetD  = DIM*jnrD;
2059
2060             /* load j atom coordinates */
2061             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2062                                               x+j_coord_offsetC,x+j_coord_offsetD,
2063                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2064                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2065
2066             /* Calculate displacement vector */
2067             dx00             = _mm_sub_ps(ix0,jx0);
2068             dy00             = _mm_sub_ps(iy0,jy0);
2069             dz00             = _mm_sub_ps(iz0,jz0);
2070             dx11             = _mm_sub_ps(ix1,jx1);
2071             dy11             = _mm_sub_ps(iy1,jy1);
2072             dz11             = _mm_sub_ps(iz1,jz1);
2073             dx12             = _mm_sub_ps(ix1,jx2);
2074             dy12             = _mm_sub_ps(iy1,jy2);
2075             dz12             = _mm_sub_ps(iz1,jz2);
2076             dx13             = _mm_sub_ps(ix1,jx3);
2077             dy13             = _mm_sub_ps(iy1,jy3);
2078             dz13             = _mm_sub_ps(iz1,jz3);
2079             dx21             = _mm_sub_ps(ix2,jx1);
2080             dy21             = _mm_sub_ps(iy2,jy1);
2081             dz21             = _mm_sub_ps(iz2,jz1);
2082             dx22             = _mm_sub_ps(ix2,jx2);
2083             dy22             = _mm_sub_ps(iy2,jy2);
2084             dz22             = _mm_sub_ps(iz2,jz2);
2085             dx23             = _mm_sub_ps(ix2,jx3);
2086             dy23             = _mm_sub_ps(iy2,jy3);
2087             dz23             = _mm_sub_ps(iz2,jz3);
2088             dx31             = _mm_sub_ps(ix3,jx1);
2089             dy31             = _mm_sub_ps(iy3,jy1);
2090             dz31             = _mm_sub_ps(iz3,jz1);
2091             dx32             = _mm_sub_ps(ix3,jx2);
2092             dy32             = _mm_sub_ps(iy3,jy2);
2093             dz32             = _mm_sub_ps(iz3,jz2);
2094             dx33             = _mm_sub_ps(ix3,jx3);
2095             dy33             = _mm_sub_ps(iy3,jy3);
2096             dz33             = _mm_sub_ps(iz3,jz3);
2097
2098             /* Calculate squared distance and things based on it */
2099             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2100             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2101             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2102             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2103             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2104             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2105             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2106             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2107             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2108             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2109
2110             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2111             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2112             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2113             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2114             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2115             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2116             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2117             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2118             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2119             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2120
2121             fjx0             = _mm_setzero_ps();
2122             fjy0             = _mm_setzero_ps();
2123             fjz0             = _mm_setzero_ps();
2124             fjx1             = _mm_setzero_ps();
2125             fjy1             = _mm_setzero_ps();
2126             fjz1             = _mm_setzero_ps();
2127             fjx2             = _mm_setzero_ps();
2128             fjy2             = _mm_setzero_ps();
2129             fjz2             = _mm_setzero_ps();
2130             fjx3             = _mm_setzero_ps();
2131             fjy3             = _mm_setzero_ps();
2132             fjz3             = _mm_setzero_ps();
2133
2134             /**************************
2135              * CALCULATE INTERACTIONS *
2136              **************************/
2137
2138             r00              = _mm_mul_ps(rsq00,rinv00);
2139             r00              = _mm_andnot_ps(dummy_mask,r00);
2140
2141             /* Calculate table index by multiplying r with table scale and truncate to integer */
2142             rt               = _mm_mul_ps(r00,vftabscale);
2143             vfitab           = _mm_cvttps_epi32(rt);
2144 #ifdef __XOP__
2145             vfeps            = _mm_frcz_ps(rt);
2146 #else
2147             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2148 #endif
2149             twovfeps         = _mm_add_ps(vfeps,vfeps);
2150             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2151
2152             /* CUBIC SPLINE TABLE DISPERSION */
2153             vfitab           = _mm_add_epi32(vfitab,ifour);
2154             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2155             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2156             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2157             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2158             _MM_TRANSPOSE4_PS(Y,F,G,H);
2159             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2160             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2161             fvdw6            = _mm_mul_ps(c6_00,FF);
2162
2163             /* CUBIC SPLINE TABLE REPULSION */
2164             vfitab           = _mm_add_epi32(vfitab,ifour);
2165             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2166             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2167             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2168             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2169             _MM_TRANSPOSE4_PS(Y,F,G,H);
2170             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2171             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2172             fvdw12           = _mm_mul_ps(c12_00,FF);
2173             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2174
2175             fscal            = fvdw;
2176
2177             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2178
2179              /* Update vectorial force */
2180             fix0             = _mm_macc_ps(dx00,fscal,fix0);
2181             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
2182             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
2183
2184             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
2185             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
2186             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
2187
2188             /**************************
2189              * CALCULATE INTERACTIONS *
2190              **************************/
2191
2192             r11              = _mm_mul_ps(rsq11,rinv11);
2193             r11              = _mm_andnot_ps(dummy_mask,r11);
2194
2195             /* Calculate table index by multiplying r with table scale and truncate to integer */
2196             rt               = _mm_mul_ps(r11,vftabscale);
2197             vfitab           = _mm_cvttps_epi32(rt);
2198 #ifdef __XOP__
2199             vfeps            = _mm_frcz_ps(rt);
2200 #else
2201             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2202 #endif
2203             twovfeps         = _mm_add_ps(vfeps,vfeps);
2204             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2205
2206             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2207             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2208             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2209             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2210             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2211             _MM_TRANSPOSE4_PS(Y,F,G,H);
2212             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2213             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2214             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2215
2216             fscal            = felec;
2217
2218             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2219
2220              /* Update vectorial force */
2221             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2222             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2223             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2224
2225             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2226             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2227             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2228
2229             /**************************
2230              * CALCULATE INTERACTIONS *
2231              **************************/
2232
2233             r12              = _mm_mul_ps(rsq12,rinv12);
2234             r12              = _mm_andnot_ps(dummy_mask,r12);
2235
2236             /* Calculate table index by multiplying r with table scale and truncate to integer */
2237             rt               = _mm_mul_ps(r12,vftabscale);
2238             vfitab           = _mm_cvttps_epi32(rt);
2239 #ifdef __XOP__
2240             vfeps            = _mm_frcz_ps(rt);
2241 #else
2242             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2243 #endif
2244             twovfeps         = _mm_add_ps(vfeps,vfeps);
2245             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2246
2247             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2248             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2249             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2250             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2251             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2252             _MM_TRANSPOSE4_PS(Y,F,G,H);
2253             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2254             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2255             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2256
2257             fscal            = felec;
2258
2259             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2260
2261              /* Update vectorial force */
2262             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2263             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2264             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2265
2266             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2267             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2268             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2269
2270             /**************************
2271              * CALCULATE INTERACTIONS *
2272              **************************/
2273
2274             r13              = _mm_mul_ps(rsq13,rinv13);
2275             r13              = _mm_andnot_ps(dummy_mask,r13);
2276
2277             /* Calculate table index by multiplying r with table scale and truncate to integer */
2278             rt               = _mm_mul_ps(r13,vftabscale);
2279             vfitab           = _mm_cvttps_epi32(rt);
2280 #ifdef __XOP__
2281             vfeps            = _mm_frcz_ps(rt);
2282 #else
2283             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2284 #endif
2285             twovfeps         = _mm_add_ps(vfeps,vfeps);
2286             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2287
2288             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2289             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2290             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2291             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2292             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2293             _MM_TRANSPOSE4_PS(Y,F,G,H);
2294             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2295             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2296             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2297
2298             fscal            = felec;
2299
2300             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2301
2302              /* Update vectorial force */
2303             fix1             = _mm_macc_ps(dx13,fscal,fix1);
2304             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
2305             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
2306
2307             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
2308             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
2309             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
2310
2311             /**************************
2312              * CALCULATE INTERACTIONS *
2313              **************************/
2314
2315             r21              = _mm_mul_ps(rsq21,rinv21);
2316             r21              = _mm_andnot_ps(dummy_mask,r21);
2317
2318             /* Calculate table index by multiplying r with table scale and truncate to integer */
2319             rt               = _mm_mul_ps(r21,vftabscale);
2320             vfitab           = _mm_cvttps_epi32(rt);
2321 #ifdef __XOP__
2322             vfeps            = _mm_frcz_ps(rt);
2323 #else
2324             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2325 #endif
2326             twovfeps         = _mm_add_ps(vfeps,vfeps);
2327             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2328
2329             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2330             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2331             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2332             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2333             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2334             _MM_TRANSPOSE4_PS(Y,F,G,H);
2335             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2336             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2337             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2338
2339             fscal            = felec;
2340
2341             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2342
2343              /* Update vectorial force */
2344             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2345             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2346             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2347
2348             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2349             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2350             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2351
2352             /**************************
2353              * CALCULATE INTERACTIONS *
2354              **************************/
2355
2356             r22              = _mm_mul_ps(rsq22,rinv22);
2357             r22              = _mm_andnot_ps(dummy_mask,r22);
2358
2359             /* Calculate table index by multiplying r with table scale and truncate to integer */
2360             rt               = _mm_mul_ps(r22,vftabscale);
2361             vfitab           = _mm_cvttps_epi32(rt);
2362 #ifdef __XOP__
2363             vfeps            = _mm_frcz_ps(rt);
2364 #else
2365             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2366 #endif
2367             twovfeps         = _mm_add_ps(vfeps,vfeps);
2368             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2369
2370             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2371             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2372             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2373             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2374             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2375             _MM_TRANSPOSE4_PS(Y,F,G,H);
2376             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2377             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2378             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2379
2380             fscal            = felec;
2381
2382             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2383
2384              /* Update vectorial force */
2385             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2386             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2387             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2388
2389             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2390             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2391             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2392
2393             /**************************
2394              * CALCULATE INTERACTIONS *
2395              **************************/
2396
2397             r23              = _mm_mul_ps(rsq23,rinv23);
2398             r23              = _mm_andnot_ps(dummy_mask,r23);
2399
2400             /* Calculate table index by multiplying r with table scale and truncate to integer */
2401             rt               = _mm_mul_ps(r23,vftabscale);
2402             vfitab           = _mm_cvttps_epi32(rt);
2403 #ifdef __XOP__
2404             vfeps            = _mm_frcz_ps(rt);
2405 #else
2406             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2407 #endif
2408             twovfeps         = _mm_add_ps(vfeps,vfeps);
2409             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2410
2411             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2412             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2413             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2414             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2415             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2416             _MM_TRANSPOSE4_PS(Y,F,G,H);
2417             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2418             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2419             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2420
2421             fscal            = felec;
2422
2423             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2424
2425              /* Update vectorial force */
2426             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2427             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2428             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2429
2430             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2431             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2432             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2433
2434             /**************************
2435              * CALCULATE INTERACTIONS *
2436              **************************/
2437
2438             r31              = _mm_mul_ps(rsq31,rinv31);
2439             r31              = _mm_andnot_ps(dummy_mask,r31);
2440
2441             /* Calculate table index by multiplying r with table scale and truncate to integer */
2442             rt               = _mm_mul_ps(r31,vftabscale);
2443             vfitab           = _mm_cvttps_epi32(rt);
2444 #ifdef __XOP__
2445             vfeps            = _mm_frcz_ps(rt);
2446 #else
2447             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2448 #endif
2449             twovfeps         = _mm_add_ps(vfeps,vfeps);
2450             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2451
2452             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2453             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2454             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2455             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2456             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2457             _MM_TRANSPOSE4_PS(Y,F,G,H);
2458             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2459             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2460             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2461
2462             fscal            = felec;
2463
2464             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2465
2466              /* Update vectorial force */
2467             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2468             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2469             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2470
2471             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2472             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2473             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2474
2475             /**************************
2476              * CALCULATE INTERACTIONS *
2477              **************************/
2478
2479             r32              = _mm_mul_ps(rsq32,rinv32);
2480             r32              = _mm_andnot_ps(dummy_mask,r32);
2481
2482             /* Calculate table index by multiplying r with table scale and truncate to integer */
2483             rt               = _mm_mul_ps(r32,vftabscale);
2484             vfitab           = _mm_cvttps_epi32(rt);
2485 #ifdef __XOP__
2486             vfeps            = _mm_frcz_ps(rt);
2487 #else
2488             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2489 #endif
2490             twovfeps         = _mm_add_ps(vfeps,vfeps);
2491             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2492
2493             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2494             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2495             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2496             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2497             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2498             _MM_TRANSPOSE4_PS(Y,F,G,H);
2499             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2500             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2501             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2502
2503             fscal            = felec;
2504
2505             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2506
2507              /* Update vectorial force */
2508             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2509             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2510             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2511
2512             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2513             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2514             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2515
2516             /**************************
2517              * CALCULATE INTERACTIONS *
2518              **************************/
2519
2520             r33              = _mm_mul_ps(rsq33,rinv33);
2521             r33              = _mm_andnot_ps(dummy_mask,r33);
2522
2523             /* Calculate table index by multiplying r with table scale and truncate to integer */
2524             rt               = _mm_mul_ps(r33,vftabscale);
2525             vfitab           = _mm_cvttps_epi32(rt);
2526 #ifdef __XOP__
2527             vfeps            = _mm_frcz_ps(rt);
2528 #else
2529             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2530 #endif
2531             twovfeps         = _mm_add_ps(vfeps,vfeps);
2532             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2533
2534             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2535             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2536             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2537             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2538             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2539             _MM_TRANSPOSE4_PS(Y,F,G,H);
2540             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2541             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2542             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2543
2544             fscal            = felec;
2545
2546             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2547
2548              /* Update vectorial force */
2549             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2550             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2551             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2552
2553             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2554             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2555             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2556
2557             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2558             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2559             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2560             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2561
2562             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2563                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2564                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2565
2566             /* Inner loop uses 442 flops */
2567         }
2568
2569         /* End of innermost loop */
2570
2571         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2572                                               f+i_coord_offset,fshift+i_shift_offset);
2573
2574         /* Increment number of inner iterations */
2575         inneriter                  += j_index_end - j_index_start;
2576
2577         /* Outer loop uses 24 flops */
2578     }
2579
2580     /* Increment number of outer iterations */
2581     outeriter        += nri;
2582
2583     /* Update outer/inner flops */
2584
2585     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*442);
2586 }