Created SIMD module
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            CubicSplineTable
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwioffset3;
92     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
112     real             *charge;
113     int              nvdwtype;
114     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115     int              *vdwtype;
116     real             *vdwparam;
117     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
118     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
119     __m128i          vfitab;
120     __m128i          ifour       = _mm_set1_epi32(4);
121     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
122     real             *vftab;
123     __m128           dummy_mask,cutoff_mask;
124     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
125     __m128           one     = _mm_set1_ps(1.0);
126     __m128           two     = _mm_set1_ps(2.0);
127     x                = xx[0];
128     f                = ff[0];
129
130     nri              = nlist->nri;
131     iinr             = nlist->iinr;
132     jindex           = nlist->jindex;
133     jjnr             = nlist->jjnr;
134     shiftidx         = nlist->shift;
135     gid              = nlist->gid;
136     shiftvec         = fr->shift_vec[0];
137     fshift           = fr->fshift[0];
138     facel            = _mm_set1_ps(fr->epsfac);
139     charge           = mdatoms->chargeA;
140     nvdwtype         = fr->ntype;
141     vdwparam         = fr->nbfp;
142     vdwtype          = mdatoms->typeA;
143
144     vftab            = kernel_data->table_elec_vdw->data;
145     vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
146
147     /* Setup water-specific parameters */
148     inr              = nlist->iinr[0];
149     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
150     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
151     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
152     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
153
154     jq1              = _mm_set1_ps(charge[inr+1]);
155     jq2              = _mm_set1_ps(charge[inr+2]);
156     jq3              = _mm_set1_ps(charge[inr+3]);
157     vdwjidx0A        = 2*vdwtype[inr+0];
158     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
159     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
160     qq11             = _mm_mul_ps(iq1,jq1);
161     qq12             = _mm_mul_ps(iq1,jq2);
162     qq13             = _mm_mul_ps(iq1,jq3);
163     qq21             = _mm_mul_ps(iq2,jq1);
164     qq22             = _mm_mul_ps(iq2,jq2);
165     qq23             = _mm_mul_ps(iq2,jq3);
166     qq31             = _mm_mul_ps(iq3,jq1);
167     qq32             = _mm_mul_ps(iq3,jq2);
168     qq33             = _mm_mul_ps(iq3,jq3);
169
170     /* Avoid stupid compiler warnings */
171     jnrA = jnrB = jnrC = jnrD = 0;
172     j_coord_offsetA = 0;
173     j_coord_offsetB = 0;
174     j_coord_offsetC = 0;
175     j_coord_offsetD = 0;
176
177     outeriter        = 0;
178     inneriter        = 0;
179
180     for(iidx=0;iidx<4*DIM;iidx++)
181     {
182         scratch[iidx] = 0.0;
183     }
184
185     /* Start outer loop over neighborlists */
186     for(iidx=0; iidx<nri; iidx++)
187     {
188         /* Load shift vector for this list */
189         i_shift_offset   = DIM*shiftidx[iidx];
190
191         /* Load limits for loop over neighbors */
192         j_index_start    = jindex[iidx];
193         j_index_end      = jindex[iidx+1];
194
195         /* Get outer coordinate index */
196         inr              = iinr[iidx];
197         i_coord_offset   = DIM*inr;
198
199         /* Load i particle coords and add shift vector */
200         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
201                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
202
203         fix0             = _mm_setzero_ps();
204         fiy0             = _mm_setzero_ps();
205         fiz0             = _mm_setzero_ps();
206         fix1             = _mm_setzero_ps();
207         fiy1             = _mm_setzero_ps();
208         fiz1             = _mm_setzero_ps();
209         fix2             = _mm_setzero_ps();
210         fiy2             = _mm_setzero_ps();
211         fiz2             = _mm_setzero_ps();
212         fix3             = _mm_setzero_ps();
213         fiy3             = _mm_setzero_ps();
214         fiz3             = _mm_setzero_ps();
215
216         /* Reset potential sums */
217         velecsum         = _mm_setzero_ps();
218         vvdwsum          = _mm_setzero_ps();
219
220         /* Start inner kernel loop */
221         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
222         {
223
224             /* Get j neighbor index, and coordinate index */
225             jnrA             = jjnr[jidx];
226             jnrB             = jjnr[jidx+1];
227             jnrC             = jjnr[jidx+2];
228             jnrD             = jjnr[jidx+3];
229             j_coord_offsetA  = DIM*jnrA;
230             j_coord_offsetB  = DIM*jnrB;
231             j_coord_offsetC  = DIM*jnrC;
232             j_coord_offsetD  = DIM*jnrD;
233
234             /* load j atom coordinates */
235             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236                                               x+j_coord_offsetC,x+j_coord_offsetD,
237                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
238                                               &jy2,&jz2,&jx3,&jy3,&jz3);
239
240             /* Calculate displacement vector */
241             dx00             = _mm_sub_ps(ix0,jx0);
242             dy00             = _mm_sub_ps(iy0,jy0);
243             dz00             = _mm_sub_ps(iz0,jz0);
244             dx11             = _mm_sub_ps(ix1,jx1);
245             dy11             = _mm_sub_ps(iy1,jy1);
246             dz11             = _mm_sub_ps(iz1,jz1);
247             dx12             = _mm_sub_ps(ix1,jx2);
248             dy12             = _mm_sub_ps(iy1,jy2);
249             dz12             = _mm_sub_ps(iz1,jz2);
250             dx13             = _mm_sub_ps(ix1,jx3);
251             dy13             = _mm_sub_ps(iy1,jy3);
252             dz13             = _mm_sub_ps(iz1,jz3);
253             dx21             = _mm_sub_ps(ix2,jx1);
254             dy21             = _mm_sub_ps(iy2,jy1);
255             dz21             = _mm_sub_ps(iz2,jz1);
256             dx22             = _mm_sub_ps(ix2,jx2);
257             dy22             = _mm_sub_ps(iy2,jy2);
258             dz22             = _mm_sub_ps(iz2,jz2);
259             dx23             = _mm_sub_ps(ix2,jx3);
260             dy23             = _mm_sub_ps(iy2,jy3);
261             dz23             = _mm_sub_ps(iz2,jz3);
262             dx31             = _mm_sub_ps(ix3,jx1);
263             dy31             = _mm_sub_ps(iy3,jy1);
264             dz31             = _mm_sub_ps(iz3,jz1);
265             dx32             = _mm_sub_ps(ix3,jx2);
266             dy32             = _mm_sub_ps(iy3,jy2);
267             dz32             = _mm_sub_ps(iz3,jz2);
268             dx33             = _mm_sub_ps(ix3,jx3);
269             dy33             = _mm_sub_ps(iy3,jy3);
270             dz33             = _mm_sub_ps(iz3,jz3);
271
272             /* Calculate squared distance and things based on it */
273             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
274             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
275             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
276             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
277             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
278             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
279             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
280             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
281             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
282             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
283
284             rinv00           = gmx_mm_invsqrt_ps(rsq00);
285             rinv11           = gmx_mm_invsqrt_ps(rsq11);
286             rinv12           = gmx_mm_invsqrt_ps(rsq12);
287             rinv13           = gmx_mm_invsqrt_ps(rsq13);
288             rinv21           = gmx_mm_invsqrt_ps(rsq21);
289             rinv22           = gmx_mm_invsqrt_ps(rsq22);
290             rinv23           = gmx_mm_invsqrt_ps(rsq23);
291             rinv31           = gmx_mm_invsqrt_ps(rsq31);
292             rinv32           = gmx_mm_invsqrt_ps(rsq32);
293             rinv33           = gmx_mm_invsqrt_ps(rsq33);
294
295             fjx0             = _mm_setzero_ps();
296             fjy0             = _mm_setzero_ps();
297             fjz0             = _mm_setzero_ps();
298             fjx1             = _mm_setzero_ps();
299             fjy1             = _mm_setzero_ps();
300             fjz1             = _mm_setzero_ps();
301             fjx2             = _mm_setzero_ps();
302             fjy2             = _mm_setzero_ps();
303             fjz2             = _mm_setzero_ps();
304             fjx3             = _mm_setzero_ps();
305             fjy3             = _mm_setzero_ps();
306             fjz3             = _mm_setzero_ps();
307
308             /**************************
309              * CALCULATE INTERACTIONS *
310              **************************/
311
312             r00              = _mm_mul_ps(rsq00,rinv00);
313
314             /* Calculate table index by multiplying r with table scale and truncate to integer */
315             rt               = _mm_mul_ps(r00,vftabscale);
316             vfitab           = _mm_cvttps_epi32(rt);
317 #ifdef __XOP__
318             vfeps            = _mm_frcz_ps(rt);
319 #else
320             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
321 #endif
322             twovfeps         = _mm_add_ps(vfeps,vfeps);
323             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
324
325             /* CUBIC SPLINE TABLE DISPERSION */
326             vfitab           = _mm_add_epi32(vfitab,ifour);
327             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
328             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
329             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
330             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
331             _MM_TRANSPOSE4_PS(Y,F,G,H);
332             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
333             VV               = _mm_macc_ps(vfeps,Fp,Y);
334             vvdw6            = _mm_mul_ps(c6_00,VV);
335             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
336             fvdw6            = _mm_mul_ps(c6_00,FF);
337
338             /* CUBIC SPLINE TABLE REPULSION */
339             vfitab           = _mm_add_epi32(vfitab,ifour);
340             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
341             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
342             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
343             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
344             _MM_TRANSPOSE4_PS(Y,F,G,H);
345             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
346             VV               = _mm_macc_ps(vfeps,Fp,Y);
347             vvdw12           = _mm_mul_ps(c12_00,VV);
348             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
349             fvdw12           = _mm_mul_ps(c12_00,FF);
350             vvdw             = _mm_add_ps(vvdw12,vvdw6);
351             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
352
353             /* Update potential sum for this i atom from the interaction with this j atom. */
354             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
355
356             fscal            = fvdw;
357
358              /* Update vectorial force */
359             fix0             = _mm_macc_ps(dx00,fscal,fix0);
360             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
361             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
362
363             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
364             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
365             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
366
367             /**************************
368              * CALCULATE INTERACTIONS *
369              **************************/
370
371             r11              = _mm_mul_ps(rsq11,rinv11);
372
373             /* Calculate table index by multiplying r with table scale and truncate to integer */
374             rt               = _mm_mul_ps(r11,vftabscale);
375             vfitab           = _mm_cvttps_epi32(rt);
376 #ifdef __XOP__
377             vfeps            = _mm_frcz_ps(rt);
378 #else
379             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
380 #endif
381             twovfeps         = _mm_add_ps(vfeps,vfeps);
382             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
383
384             /* CUBIC SPLINE TABLE ELECTROSTATICS */
385             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
386             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
387             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
388             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
389             _MM_TRANSPOSE4_PS(Y,F,G,H);
390             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
391             VV               = _mm_macc_ps(vfeps,Fp,Y);
392             velec            = _mm_mul_ps(qq11,VV);
393             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
394             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
395
396             /* Update potential sum for this i atom from the interaction with this j atom. */
397             velecsum         = _mm_add_ps(velecsum,velec);
398
399             fscal            = felec;
400
401              /* Update vectorial force */
402             fix1             = _mm_macc_ps(dx11,fscal,fix1);
403             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
404             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
405
406             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
407             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
408             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
409
410             /**************************
411              * CALCULATE INTERACTIONS *
412              **************************/
413
414             r12              = _mm_mul_ps(rsq12,rinv12);
415
416             /* Calculate table index by multiplying r with table scale and truncate to integer */
417             rt               = _mm_mul_ps(r12,vftabscale);
418             vfitab           = _mm_cvttps_epi32(rt);
419 #ifdef __XOP__
420             vfeps            = _mm_frcz_ps(rt);
421 #else
422             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
423 #endif
424             twovfeps         = _mm_add_ps(vfeps,vfeps);
425             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
426
427             /* CUBIC SPLINE TABLE ELECTROSTATICS */
428             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
429             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
430             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
431             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
432             _MM_TRANSPOSE4_PS(Y,F,G,H);
433             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
434             VV               = _mm_macc_ps(vfeps,Fp,Y);
435             velec            = _mm_mul_ps(qq12,VV);
436             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
437             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
438
439             /* Update potential sum for this i atom from the interaction with this j atom. */
440             velecsum         = _mm_add_ps(velecsum,velec);
441
442             fscal            = felec;
443
444              /* Update vectorial force */
445             fix1             = _mm_macc_ps(dx12,fscal,fix1);
446             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
447             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
448
449             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
450             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
451             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
452
453             /**************************
454              * CALCULATE INTERACTIONS *
455              **************************/
456
457             r13              = _mm_mul_ps(rsq13,rinv13);
458
459             /* Calculate table index by multiplying r with table scale and truncate to integer */
460             rt               = _mm_mul_ps(r13,vftabscale);
461             vfitab           = _mm_cvttps_epi32(rt);
462 #ifdef __XOP__
463             vfeps            = _mm_frcz_ps(rt);
464 #else
465             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
466 #endif
467             twovfeps         = _mm_add_ps(vfeps,vfeps);
468             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
469
470             /* CUBIC SPLINE TABLE ELECTROSTATICS */
471             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
472             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
473             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
474             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
475             _MM_TRANSPOSE4_PS(Y,F,G,H);
476             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
477             VV               = _mm_macc_ps(vfeps,Fp,Y);
478             velec            = _mm_mul_ps(qq13,VV);
479             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
480             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
481
482             /* Update potential sum for this i atom from the interaction with this j atom. */
483             velecsum         = _mm_add_ps(velecsum,velec);
484
485             fscal            = felec;
486
487              /* Update vectorial force */
488             fix1             = _mm_macc_ps(dx13,fscal,fix1);
489             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
490             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
491
492             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
493             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
494             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
495
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             r21              = _mm_mul_ps(rsq21,rinv21);
501
502             /* Calculate table index by multiplying r with table scale and truncate to integer */
503             rt               = _mm_mul_ps(r21,vftabscale);
504             vfitab           = _mm_cvttps_epi32(rt);
505 #ifdef __XOP__
506             vfeps            = _mm_frcz_ps(rt);
507 #else
508             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
509 #endif
510             twovfeps         = _mm_add_ps(vfeps,vfeps);
511             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
512
513             /* CUBIC SPLINE TABLE ELECTROSTATICS */
514             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
515             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
516             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
517             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
518             _MM_TRANSPOSE4_PS(Y,F,G,H);
519             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
520             VV               = _mm_macc_ps(vfeps,Fp,Y);
521             velec            = _mm_mul_ps(qq21,VV);
522             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
523             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
524
525             /* Update potential sum for this i atom from the interaction with this j atom. */
526             velecsum         = _mm_add_ps(velecsum,velec);
527
528             fscal            = felec;
529
530              /* Update vectorial force */
531             fix2             = _mm_macc_ps(dx21,fscal,fix2);
532             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
533             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
534
535             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
536             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
537             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
538
539             /**************************
540              * CALCULATE INTERACTIONS *
541              **************************/
542
543             r22              = _mm_mul_ps(rsq22,rinv22);
544
545             /* Calculate table index by multiplying r with table scale and truncate to integer */
546             rt               = _mm_mul_ps(r22,vftabscale);
547             vfitab           = _mm_cvttps_epi32(rt);
548 #ifdef __XOP__
549             vfeps            = _mm_frcz_ps(rt);
550 #else
551             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
552 #endif
553             twovfeps         = _mm_add_ps(vfeps,vfeps);
554             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
555
556             /* CUBIC SPLINE TABLE ELECTROSTATICS */
557             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
558             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
559             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
560             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
561             _MM_TRANSPOSE4_PS(Y,F,G,H);
562             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
563             VV               = _mm_macc_ps(vfeps,Fp,Y);
564             velec            = _mm_mul_ps(qq22,VV);
565             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
566             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
567
568             /* Update potential sum for this i atom from the interaction with this j atom. */
569             velecsum         = _mm_add_ps(velecsum,velec);
570
571             fscal            = felec;
572
573              /* Update vectorial force */
574             fix2             = _mm_macc_ps(dx22,fscal,fix2);
575             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
576             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
577
578             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
579             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
580             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
581
582             /**************************
583              * CALCULATE INTERACTIONS *
584              **************************/
585
586             r23              = _mm_mul_ps(rsq23,rinv23);
587
588             /* Calculate table index by multiplying r with table scale and truncate to integer */
589             rt               = _mm_mul_ps(r23,vftabscale);
590             vfitab           = _mm_cvttps_epi32(rt);
591 #ifdef __XOP__
592             vfeps            = _mm_frcz_ps(rt);
593 #else
594             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
595 #endif
596             twovfeps         = _mm_add_ps(vfeps,vfeps);
597             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
598
599             /* CUBIC SPLINE TABLE ELECTROSTATICS */
600             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
601             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
602             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
603             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
604             _MM_TRANSPOSE4_PS(Y,F,G,H);
605             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
606             VV               = _mm_macc_ps(vfeps,Fp,Y);
607             velec            = _mm_mul_ps(qq23,VV);
608             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
609             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
610
611             /* Update potential sum for this i atom from the interaction with this j atom. */
612             velecsum         = _mm_add_ps(velecsum,velec);
613
614             fscal            = felec;
615
616              /* Update vectorial force */
617             fix2             = _mm_macc_ps(dx23,fscal,fix2);
618             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
619             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
620
621             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
622             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
623             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
624
625             /**************************
626              * CALCULATE INTERACTIONS *
627              **************************/
628
629             r31              = _mm_mul_ps(rsq31,rinv31);
630
631             /* Calculate table index by multiplying r with table scale and truncate to integer */
632             rt               = _mm_mul_ps(r31,vftabscale);
633             vfitab           = _mm_cvttps_epi32(rt);
634 #ifdef __XOP__
635             vfeps            = _mm_frcz_ps(rt);
636 #else
637             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
638 #endif
639             twovfeps         = _mm_add_ps(vfeps,vfeps);
640             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
641
642             /* CUBIC SPLINE TABLE ELECTROSTATICS */
643             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
644             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
645             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
646             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
647             _MM_TRANSPOSE4_PS(Y,F,G,H);
648             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
649             VV               = _mm_macc_ps(vfeps,Fp,Y);
650             velec            = _mm_mul_ps(qq31,VV);
651             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
652             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
653
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velecsum         = _mm_add_ps(velecsum,velec);
656
657             fscal            = felec;
658
659              /* Update vectorial force */
660             fix3             = _mm_macc_ps(dx31,fscal,fix3);
661             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
662             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
663
664             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
665             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
666             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
667
668             /**************************
669              * CALCULATE INTERACTIONS *
670              **************************/
671
672             r32              = _mm_mul_ps(rsq32,rinv32);
673
674             /* Calculate table index by multiplying r with table scale and truncate to integer */
675             rt               = _mm_mul_ps(r32,vftabscale);
676             vfitab           = _mm_cvttps_epi32(rt);
677 #ifdef __XOP__
678             vfeps            = _mm_frcz_ps(rt);
679 #else
680             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
681 #endif
682             twovfeps         = _mm_add_ps(vfeps,vfeps);
683             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
684
685             /* CUBIC SPLINE TABLE ELECTROSTATICS */
686             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
687             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
688             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
689             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
690             _MM_TRANSPOSE4_PS(Y,F,G,H);
691             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
692             VV               = _mm_macc_ps(vfeps,Fp,Y);
693             velec            = _mm_mul_ps(qq32,VV);
694             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
695             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
696
697             /* Update potential sum for this i atom from the interaction with this j atom. */
698             velecsum         = _mm_add_ps(velecsum,velec);
699
700             fscal            = felec;
701
702              /* Update vectorial force */
703             fix3             = _mm_macc_ps(dx32,fscal,fix3);
704             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
705             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
706
707             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
708             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
709             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
710
711             /**************************
712              * CALCULATE INTERACTIONS *
713              **************************/
714
715             r33              = _mm_mul_ps(rsq33,rinv33);
716
717             /* Calculate table index by multiplying r with table scale and truncate to integer */
718             rt               = _mm_mul_ps(r33,vftabscale);
719             vfitab           = _mm_cvttps_epi32(rt);
720 #ifdef __XOP__
721             vfeps            = _mm_frcz_ps(rt);
722 #else
723             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
724 #endif
725             twovfeps         = _mm_add_ps(vfeps,vfeps);
726             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
727
728             /* CUBIC SPLINE TABLE ELECTROSTATICS */
729             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
730             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
731             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
732             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
733             _MM_TRANSPOSE4_PS(Y,F,G,H);
734             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
735             VV               = _mm_macc_ps(vfeps,Fp,Y);
736             velec            = _mm_mul_ps(qq33,VV);
737             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
738             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
739
740             /* Update potential sum for this i atom from the interaction with this j atom. */
741             velecsum         = _mm_add_ps(velecsum,velec);
742
743             fscal            = felec;
744
745              /* Update vectorial force */
746             fix3             = _mm_macc_ps(dx33,fscal,fix3);
747             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
748             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
749
750             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
751             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
752             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
753
754             fjptrA             = f+j_coord_offsetA;
755             fjptrB             = f+j_coord_offsetB;
756             fjptrC             = f+j_coord_offsetC;
757             fjptrD             = f+j_coord_offsetD;
758
759             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
760                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
761                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
762
763             /* Inner loop uses 476 flops */
764         }
765
766         if(jidx<j_index_end)
767         {
768
769             /* Get j neighbor index, and coordinate index */
770             jnrlistA         = jjnr[jidx];
771             jnrlistB         = jjnr[jidx+1];
772             jnrlistC         = jjnr[jidx+2];
773             jnrlistD         = jjnr[jidx+3];
774             /* Sign of each element will be negative for non-real atoms.
775              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
776              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
777              */
778             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
779             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
780             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
781             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
782             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
783             j_coord_offsetA  = DIM*jnrA;
784             j_coord_offsetB  = DIM*jnrB;
785             j_coord_offsetC  = DIM*jnrC;
786             j_coord_offsetD  = DIM*jnrD;
787
788             /* load j atom coordinates */
789             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
790                                               x+j_coord_offsetC,x+j_coord_offsetD,
791                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
792                                               &jy2,&jz2,&jx3,&jy3,&jz3);
793
794             /* Calculate displacement vector */
795             dx00             = _mm_sub_ps(ix0,jx0);
796             dy00             = _mm_sub_ps(iy0,jy0);
797             dz00             = _mm_sub_ps(iz0,jz0);
798             dx11             = _mm_sub_ps(ix1,jx1);
799             dy11             = _mm_sub_ps(iy1,jy1);
800             dz11             = _mm_sub_ps(iz1,jz1);
801             dx12             = _mm_sub_ps(ix1,jx2);
802             dy12             = _mm_sub_ps(iy1,jy2);
803             dz12             = _mm_sub_ps(iz1,jz2);
804             dx13             = _mm_sub_ps(ix1,jx3);
805             dy13             = _mm_sub_ps(iy1,jy3);
806             dz13             = _mm_sub_ps(iz1,jz3);
807             dx21             = _mm_sub_ps(ix2,jx1);
808             dy21             = _mm_sub_ps(iy2,jy1);
809             dz21             = _mm_sub_ps(iz2,jz1);
810             dx22             = _mm_sub_ps(ix2,jx2);
811             dy22             = _mm_sub_ps(iy2,jy2);
812             dz22             = _mm_sub_ps(iz2,jz2);
813             dx23             = _mm_sub_ps(ix2,jx3);
814             dy23             = _mm_sub_ps(iy2,jy3);
815             dz23             = _mm_sub_ps(iz2,jz3);
816             dx31             = _mm_sub_ps(ix3,jx1);
817             dy31             = _mm_sub_ps(iy3,jy1);
818             dz31             = _mm_sub_ps(iz3,jz1);
819             dx32             = _mm_sub_ps(ix3,jx2);
820             dy32             = _mm_sub_ps(iy3,jy2);
821             dz32             = _mm_sub_ps(iz3,jz2);
822             dx33             = _mm_sub_ps(ix3,jx3);
823             dy33             = _mm_sub_ps(iy3,jy3);
824             dz33             = _mm_sub_ps(iz3,jz3);
825
826             /* Calculate squared distance and things based on it */
827             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
828             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
829             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
830             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
831             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
832             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
833             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
834             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
835             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
836             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
837
838             rinv00           = gmx_mm_invsqrt_ps(rsq00);
839             rinv11           = gmx_mm_invsqrt_ps(rsq11);
840             rinv12           = gmx_mm_invsqrt_ps(rsq12);
841             rinv13           = gmx_mm_invsqrt_ps(rsq13);
842             rinv21           = gmx_mm_invsqrt_ps(rsq21);
843             rinv22           = gmx_mm_invsqrt_ps(rsq22);
844             rinv23           = gmx_mm_invsqrt_ps(rsq23);
845             rinv31           = gmx_mm_invsqrt_ps(rsq31);
846             rinv32           = gmx_mm_invsqrt_ps(rsq32);
847             rinv33           = gmx_mm_invsqrt_ps(rsq33);
848
849             fjx0             = _mm_setzero_ps();
850             fjy0             = _mm_setzero_ps();
851             fjz0             = _mm_setzero_ps();
852             fjx1             = _mm_setzero_ps();
853             fjy1             = _mm_setzero_ps();
854             fjz1             = _mm_setzero_ps();
855             fjx2             = _mm_setzero_ps();
856             fjy2             = _mm_setzero_ps();
857             fjz2             = _mm_setzero_ps();
858             fjx3             = _mm_setzero_ps();
859             fjy3             = _mm_setzero_ps();
860             fjz3             = _mm_setzero_ps();
861
862             /**************************
863              * CALCULATE INTERACTIONS *
864              **************************/
865
866             r00              = _mm_mul_ps(rsq00,rinv00);
867             r00              = _mm_andnot_ps(dummy_mask,r00);
868
869             /* Calculate table index by multiplying r with table scale and truncate to integer */
870             rt               = _mm_mul_ps(r00,vftabscale);
871             vfitab           = _mm_cvttps_epi32(rt);
872 #ifdef __XOP__
873             vfeps            = _mm_frcz_ps(rt);
874 #else
875             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
876 #endif
877             twovfeps         = _mm_add_ps(vfeps,vfeps);
878             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
879
880             /* CUBIC SPLINE TABLE DISPERSION */
881             vfitab           = _mm_add_epi32(vfitab,ifour);
882             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
883             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
884             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
885             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
886             _MM_TRANSPOSE4_PS(Y,F,G,H);
887             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
888             VV               = _mm_macc_ps(vfeps,Fp,Y);
889             vvdw6            = _mm_mul_ps(c6_00,VV);
890             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
891             fvdw6            = _mm_mul_ps(c6_00,FF);
892
893             /* CUBIC SPLINE TABLE REPULSION */
894             vfitab           = _mm_add_epi32(vfitab,ifour);
895             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
896             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
897             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
898             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
899             _MM_TRANSPOSE4_PS(Y,F,G,H);
900             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
901             VV               = _mm_macc_ps(vfeps,Fp,Y);
902             vvdw12           = _mm_mul_ps(c12_00,VV);
903             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
904             fvdw12           = _mm_mul_ps(c12_00,FF);
905             vvdw             = _mm_add_ps(vvdw12,vvdw6);
906             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
907
908             /* Update potential sum for this i atom from the interaction with this j atom. */
909             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
910             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
911
912             fscal            = fvdw;
913
914             fscal            = _mm_andnot_ps(dummy_mask,fscal);
915
916              /* Update vectorial force */
917             fix0             = _mm_macc_ps(dx00,fscal,fix0);
918             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
919             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
920
921             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
922             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
923             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
924
925             /**************************
926              * CALCULATE INTERACTIONS *
927              **************************/
928
929             r11              = _mm_mul_ps(rsq11,rinv11);
930             r11              = _mm_andnot_ps(dummy_mask,r11);
931
932             /* Calculate table index by multiplying r with table scale and truncate to integer */
933             rt               = _mm_mul_ps(r11,vftabscale);
934             vfitab           = _mm_cvttps_epi32(rt);
935 #ifdef __XOP__
936             vfeps            = _mm_frcz_ps(rt);
937 #else
938             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
939 #endif
940             twovfeps         = _mm_add_ps(vfeps,vfeps);
941             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
942
943             /* CUBIC SPLINE TABLE ELECTROSTATICS */
944             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
945             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
946             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
947             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
948             _MM_TRANSPOSE4_PS(Y,F,G,H);
949             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
950             VV               = _mm_macc_ps(vfeps,Fp,Y);
951             velec            = _mm_mul_ps(qq11,VV);
952             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
953             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
954
955             /* Update potential sum for this i atom from the interaction with this j atom. */
956             velec            = _mm_andnot_ps(dummy_mask,velec);
957             velecsum         = _mm_add_ps(velecsum,velec);
958
959             fscal            = felec;
960
961             fscal            = _mm_andnot_ps(dummy_mask,fscal);
962
963              /* Update vectorial force */
964             fix1             = _mm_macc_ps(dx11,fscal,fix1);
965             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
966             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
967
968             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
969             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
970             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
971
972             /**************************
973              * CALCULATE INTERACTIONS *
974              **************************/
975
976             r12              = _mm_mul_ps(rsq12,rinv12);
977             r12              = _mm_andnot_ps(dummy_mask,r12);
978
979             /* Calculate table index by multiplying r with table scale and truncate to integer */
980             rt               = _mm_mul_ps(r12,vftabscale);
981             vfitab           = _mm_cvttps_epi32(rt);
982 #ifdef __XOP__
983             vfeps            = _mm_frcz_ps(rt);
984 #else
985             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
986 #endif
987             twovfeps         = _mm_add_ps(vfeps,vfeps);
988             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
989
990             /* CUBIC SPLINE TABLE ELECTROSTATICS */
991             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
992             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
993             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
994             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
995             _MM_TRANSPOSE4_PS(Y,F,G,H);
996             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
997             VV               = _mm_macc_ps(vfeps,Fp,Y);
998             velec            = _mm_mul_ps(qq12,VV);
999             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1000             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1001
1002             /* Update potential sum for this i atom from the interaction with this j atom. */
1003             velec            = _mm_andnot_ps(dummy_mask,velec);
1004             velecsum         = _mm_add_ps(velecsum,velec);
1005
1006             fscal            = felec;
1007
1008             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1009
1010              /* Update vectorial force */
1011             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1012             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1013             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1014
1015             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1016             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1017             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1018
1019             /**************************
1020              * CALCULATE INTERACTIONS *
1021              **************************/
1022
1023             r13              = _mm_mul_ps(rsq13,rinv13);
1024             r13              = _mm_andnot_ps(dummy_mask,r13);
1025
1026             /* Calculate table index by multiplying r with table scale and truncate to integer */
1027             rt               = _mm_mul_ps(r13,vftabscale);
1028             vfitab           = _mm_cvttps_epi32(rt);
1029 #ifdef __XOP__
1030             vfeps            = _mm_frcz_ps(rt);
1031 #else
1032             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1033 #endif
1034             twovfeps         = _mm_add_ps(vfeps,vfeps);
1035             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1036
1037             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1038             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1039             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1040             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1041             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1042             _MM_TRANSPOSE4_PS(Y,F,G,H);
1043             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1044             VV               = _mm_macc_ps(vfeps,Fp,Y);
1045             velec            = _mm_mul_ps(qq13,VV);
1046             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1047             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1048
1049             /* Update potential sum for this i atom from the interaction with this j atom. */
1050             velec            = _mm_andnot_ps(dummy_mask,velec);
1051             velecsum         = _mm_add_ps(velecsum,velec);
1052
1053             fscal            = felec;
1054
1055             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1056
1057              /* Update vectorial force */
1058             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1059             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1060             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1061
1062             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1063             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1064             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1065
1066             /**************************
1067              * CALCULATE INTERACTIONS *
1068              **************************/
1069
1070             r21              = _mm_mul_ps(rsq21,rinv21);
1071             r21              = _mm_andnot_ps(dummy_mask,r21);
1072
1073             /* Calculate table index by multiplying r with table scale and truncate to integer */
1074             rt               = _mm_mul_ps(r21,vftabscale);
1075             vfitab           = _mm_cvttps_epi32(rt);
1076 #ifdef __XOP__
1077             vfeps            = _mm_frcz_ps(rt);
1078 #else
1079             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1080 #endif
1081             twovfeps         = _mm_add_ps(vfeps,vfeps);
1082             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1083
1084             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1085             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1086             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1087             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1088             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1089             _MM_TRANSPOSE4_PS(Y,F,G,H);
1090             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1091             VV               = _mm_macc_ps(vfeps,Fp,Y);
1092             velec            = _mm_mul_ps(qq21,VV);
1093             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1094             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1095
1096             /* Update potential sum for this i atom from the interaction with this j atom. */
1097             velec            = _mm_andnot_ps(dummy_mask,velec);
1098             velecsum         = _mm_add_ps(velecsum,velec);
1099
1100             fscal            = felec;
1101
1102             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1103
1104              /* Update vectorial force */
1105             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1106             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1107             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1108
1109             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1110             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1111             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1112
1113             /**************************
1114              * CALCULATE INTERACTIONS *
1115              **************************/
1116
1117             r22              = _mm_mul_ps(rsq22,rinv22);
1118             r22              = _mm_andnot_ps(dummy_mask,r22);
1119
1120             /* Calculate table index by multiplying r with table scale and truncate to integer */
1121             rt               = _mm_mul_ps(r22,vftabscale);
1122             vfitab           = _mm_cvttps_epi32(rt);
1123 #ifdef __XOP__
1124             vfeps            = _mm_frcz_ps(rt);
1125 #else
1126             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1127 #endif
1128             twovfeps         = _mm_add_ps(vfeps,vfeps);
1129             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1130
1131             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1132             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1133             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1134             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1135             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1136             _MM_TRANSPOSE4_PS(Y,F,G,H);
1137             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1138             VV               = _mm_macc_ps(vfeps,Fp,Y);
1139             velec            = _mm_mul_ps(qq22,VV);
1140             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1141             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1142
1143             /* Update potential sum for this i atom from the interaction with this j atom. */
1144             velec            = _mm_andnot_ps(dummy_mask,velec);
1145             velecsum         = _mm_add_ps(velecsum,velec);
1146
1147             fscal            = felec;
1148
1149             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1150
1151              /* Update vectorial force */
1152             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1153             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1154             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1155
1156             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1157             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1158             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1159
1160             /**************************
1161              * CALCULATE INTERACTIONS *
1162              **************************/
1163
1164             r23              = _mm_mul_ps(rsq23,rinv23);
1165             r23              = _mm_andnot_ps(dummy_mask,r23);
1166
1167             /* Calculate table index by multiplying r with table scale and truncate to integer */
1168             rt               = _mm_mul_ps(r23,vftabscale);
1169             vfitab           = _mm_cvttps_epi32(rt);
1170 #ifdef __XOP__
1171             vfeps            = _mm_frcz_ps(rt);
1172 #else
1173             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1174 #endif
1175             twovfeps         = _mm_add_ps(vfeps,vfeps);
1176             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1177
1178             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1179             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1180             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1181             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1182             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1183             _MM_TRANSPOSE4_PS(Y,F,G,H);
1184             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1185             VV               = _mm_macc_ps(vfeps,Fp,Y);
1186             velec            = _mm_mul_ps(qq23,VV);
1187             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1188             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1189
1190             /* Update potential sum for this i atom from the interaction with this j atom. */
1191             velec            = _mm_andnot_ps(dummy_mask,velec);
1192             velecsum         = _mm_add_ps(velecsum,velec);
1193
1194             fscal            = felec;
1195
1196             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1197
1198              /* Update vectorial force */
1199             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1200             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1201             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1202
1203             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1204             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1205             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1206
1207             /**************************
1208              * CALCULATE INTERACTIONS *
1209              **************************/
1210
1211             r31              = _mm_mul_ps(rsq31,rinv31);
1212             r31              = _mm_andnot_ps(dummy_mask,r31);
1213
1214             /* Calculate table index by multiplying r with table scale and truncate to integer */
1215             rt               = _mm_mul_ps(r31,vftabscale);
1216             vfitab           = _mm_cvttps_epi32(rt);
1217 #ifdef __XOP__
1218             vfeps            = _mm_frcz_ps(rt);
1219 #else
1220             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1221 #endif
1222             twovfeps         = _mm_add_ps(vfeps,vfeps);
1223             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1224
1225             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1226             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1227             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1228             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1229             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1230             _MM_TRANSPOSE4_PS(Y,F,G,H);
1231             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1232             VV               = _mm_macc_ps(vfeps,Fp,Y);
1233             velec            = _mm_mul_ps(qq31,VV);
1234             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1235             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1236
1237             /* Update potential sum for this i atom from the interaction with this j atom. */
1238             velec            = _mm_andnot_ps(dummy_mask,velec);
1239             velecsum         = _mm_add_ps(velecsum,velec);
1240
1241             fscal            = felec;
1242
1243             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1244
1245              /* Update vectorial force */
1246             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1247             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1248             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1249
1250             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1251             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1252             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1253
1254             /**************************
1255              * CALCULATE INTERACTIONS *
1256              **************************/
1257
1258             r32              = _mm_mul_ps(rsq32,rinv32);
1259             r32              = _mm_andnot_ps(dummy_mask,r32);
1260
1261             /* Calculate table index by multiplying r with table scale and truncate to integer */
1262             rt               = _mm_mul_ps(r32,vftabscale);
1263             vfitab           = _mm_cvttps_epi32(rt);
1264 #ifdef __XOP__
1265             vfeps            = _mm_frcz_ps(rt);
1266 #else
1267             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1268 #endif
1269             twovfeps         = _mm_add_ps(vfeps,vfeps);
1270             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1271
1272             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1273             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1274             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1275             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1276             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1277             _MM_TRANSPOSE4_PS(Y,F,G,H);
1278             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1279             VV               = _mm_macc_ps(vfeps,Fp,Y);
1280             velec            = _mm_mul_ps(qq32,VV);
1281             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1282             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1283
1284             /* Update potential sum for this i atom from the interaction with this j atom. */
1285             velec            = _mm_andnot_ps(dummy_mask,velec);
1286             velecsum         = _mm_add_ps(velecsum,velec);
1287
1288             fscal            = felec;
1289
1290             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1291
1292              /* Update vectorial force */
1293             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1294             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1295             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1296
1297             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1298             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1299             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1300
1301             /**************************
1302              * CALCULATE INTERACTIONS *
1303              **************************/
1304
1305             r33              = _mm_mul_ps(rsq33,rinv33);
1306             r33              = _mm_andnot_ps(dummy_mask,r33);
1307
1308             /* Calculate table index by multiplying r with table scale and truncate to integer */
1309             rt               = _mm_mul_ps(r33,vftabscale);
1310             vfitab           = _mm_cvttps_epi32(rt);
1311 #ifdef __XOP__
1312             vfeps            = _mm_frcz_ps(rt);
1313 #else
1314             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1315 #endif
1316             twovfeps         = _mm_add_ps(vfeps,vfeps);
1317             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1318
1319             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1320             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1321             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1322             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1323             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1324             _MM_TRANSPOSE4_PS(Y,F,G,H);
1325             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1326             VV               = _mm_macc_ps(vfeps,Fp,Y);
1327             velec            = _mm_mul_ps(qq33,VV);
1328             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1329             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1330
1331             /* Update potential sum for this i atom from the interaction with this j atom. */
1332             velec            = _mm_andnot_ps(dummy_mask,velec);
1333             velecsum         = _mm_add_ps(velecsum,velec);
1334
1335             fscal            = felec;
1336
1337             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1338
1339              /* Update vectorial force */
1340             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1341             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1342             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1343
1344             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1345             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1346             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1347
1348             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1349             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1350             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1351             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1352
1353             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1354                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1355                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1356
1357             /* Inner loop uses 486 flops */
1358         }
1359
1360         /* End of innermost loop */
1361
1362         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1363                                               f+i_coord_offset,fshift+i_shift_offset);
1364
1365         ggid                        = gid[iidx];
1366         /* Update potential energies */
1367         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1368         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1369
1370         /* Increment number of inner iterations */
1371         inneriter                  += j_index_end - j_index_start;
1372
1373         /* Outer loop uses 26 flops */
1374     }
1375
1376     /* Increment number of outer iterations */
1377     outeriter        += nri;
1378
1379     /* Update outer/inner flops */
1380
1381     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*486);
1382 }
1383 /*
1384  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1385  * Electrostatics interaction: CubicSplineTable
1386  * VdW interaction:            CubicSplineTable
1387  * Geometry:                   Water4-Water4
1388  * Calculate force/pot:        Force
1389  */
1390 void
1391 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1392                     (t_nblist                    * gmx_restrict       nlist,
1393                      rvec                        * gmx_restrict          xx,
1394                      rvec                        * gmx_restrict          ff,
1395                      t_forcerec                  * gmx_restrict          fr,
1396                      t_mdatoms                   * gmx_restrict     mdatoms,
1397                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1398                      t_nrnb                      * gmx_restrict        nrnb)
1399 {
1400     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1401      * just 0 for non-waters.
1402      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1403      * jnr indices corresponding to data put in the four positions in the SIMD register.
1404      */
1405     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1406     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1407     int              jnrA,jnrB,jnrC,jnrD;
1408     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1409     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1410     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1411     real             rcutoff_scalar;
1412     real             *shiftvec,*fshift,*x,*f;
1413     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1414     real             scratch[4*DIM];
1415     __m128           fscal,rcutoff,rcutoff2,jidxall;
1416     int              vdwioffset0;
1417     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1418     int              vdwioffset1;
1419     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1420     int              vdwioffset2;
1421     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1422     int              vdwioffset3;
1423     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1424     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1425     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1426     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1427     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1428     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1429     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1430     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1431     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1432     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1433     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1434     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1435     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1436     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1437     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1438     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1439     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1440     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1441     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1442     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1443     real             *charge;
1444     int              nvdwtype;
1445     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1446     int              *vdwtype;
1447     real             *vdwparam;
1448     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1449     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1450     __m128i          vfitab;
1451     __m128i          ifour       = _mm_set1_epi32(4);
1452     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1453     real             *vftab;
1454     __m128           dummy_mask,cutoff_mask;
1455     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1456     __m128           one     = _mm_set1_ps(1.0);
1457     __m128           two     = _mm_set1_ps(2.0);
1458     x                = xx[0];
1459     f                = ff[0];
1460
1461     nri              = nlist->nri;
1462     iinr             = nlist->iinr;
1463     jindex           = nlist->jindex;
1464     jjnr             = nlist->jjnr;
1465     shiftidx         = nlist->shift;
1466     gid              = nlist->gid;
1467     shiftvec         = fr->shift_vec[0];
1468     fshift           = fr->fshift[0];
1469     facel            = _mm_set1_ps(fr->epsfac);
1470     charge           = mdatoms->chargeA;
1471     nvdwtype         = fr->ntype;
1472     vdwparam         = fr->nbfp;
1473     vdwtype          = mdatoms->typeA;
1474
1475     vftab            = kernel_data->table_elec_vdw->data;
1476     vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1477
1478     /* Setup water-specific parameters */
1479     inr              = nlist->iinr[0];
1480     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1481     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1482     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1483     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1484
1485     jq1              = _mm_set1_ps(charge[inr+1]);
1486     jq2              = _mm_set1_ps(charge[inr+2]);
1487     jq3              = _mm_set1_ps(charge[inr+3]);
1488     vdwjidx0A        = 2*vdwtype[inr+0];
1489     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1490     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1491     qq11             = _mm_mul_ps(iq1,jq1);
1492     qq12             = _mm_mul_ps(iq1,jq2);
1493     qq13             = _mm_mul_ps(iq1,jq3);
1494     qq21             = _mm_mul_ps(iq2,jq1);
1495     qq22             = _mm_mul_ps(iq2,jq2);
1496     qq23             = _mm_mul_ps(iq2,jq3);
1497     qq31             = _mm_mul_ps(iq3,jq1);
1498     qq32             = _mm_mul_ps(iq3,jq2);
1499     qq33             = _mm_mul_ps(iq3,jq3);
1500
1501     /* Avoid stupid compiler warnings */
1502     jnrA = jnrB = jnrC = jnrD = 0;
1503     j_coord_offsetA = 0;
1504     j_coord_offsetB = 0;
1505     j_coord_offsetC = 0;
1506     j_coord_offsetD = 0;
1507
1508     outeriter        = 0;
1509     inneriter        = 0;
1510
1511     for(iidx=0;iidx<4*DIM;iidx++)
1512     {
1513         scratch[iidx] = 0.0;
1514     }
1515
1516     /* Start outer loop over neighborlists */
1517     for(iidx=0; iidx<nri; iidx++)
1518     {
1519         /* Load shift vector for this list */
1520         i_shift_offset   = DIM*shiftidx[iidx];
1521
1522         /* Load limits for loop over neighbors */
1523         j_index_start    = jindex[iidx];
1524         j_index_end      = jindex[iidx+1];
1525
1526         /* Get outer coordinate index */
1527         inr              = iinr[iidx];
1528         i_coord_offset   = DIM*inr;
1529
1530         /* Load i particle coords and add shift vector */
1531         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1532                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1533
1534         fix0             = _mm_setzero_ps();
1535         fiy0             = _mm_setzero_ps();
1536         fiz0             = _mm_setzero_ps();
1537         fix1             = _mm_setzero_ps();
1538         fiy1             = _mm_setzero_ps();
1539         fiz1             = _mm_setzero_ps();
1540         fix2             = _mm_setzero_ps();
1541         fiy2             = _mm_setzero_ps();
1542         fiz2             = _mm_setzero_ps();
1543         fix3             = _mm_setzero_ps();
1544         fiy3             = _mm_setzero_ps();
1545         fiz3             = _mm_setzero_ps();
1546
1547         /* Start inner kernel loop */
1548         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1549         {
1550
1551             /* Get j neighbor index, and coordinate index */
1552             jnrA             = jjnr[jidx];
1553             jnrB             = jjnr[jidx+1];
1554             jnrC             = jjnr[jidx+2];
1555             jnrD             = jjnr[jidx+3];
1556             j_coord_offsetA  = DIM*jnrA;
1557             j_coord_offsetB  = DIM*jnrB;
1558             j_coord_offsetC  = DIM*jnrC;
1559             j_coord_offsetD  = DIM*jnrD;
1560
1561             /* load j atom coordinates */
1562             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1563                                               x+j_coord_offsetC,x+j_coord_offsetD,
1564                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1565                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1566
1567             /* Calculate displacement vector */
1568             dx00             = _mm_sub_ps(ix0,jx0);
1569             dy00             = _mm_sub_ps(iy0,jy0);
1570             dz00             = _mm_sub_ps(iz0,jz0);
1571             dx11             = _mm_sub_ps(ix1,jx1);
1572             dy11             = _mm_sub_ps(iy1,jy1);
1573             dz11             = _mm_sub_ps(iz1,jz1);
1574             dx12             = _mm_sub_ps(ix1,jx2);
1575             dy12             = _mm_sub_ps(iy1,jy2);
1576             dz12             = _mm_sub_ps(iz1,jz2);
1577             dx13             = _mm_sub_ps(ix1,jx3);
1578             dy13             = _mm_sub_ps(iy1,jy3);
1579             dz13             = _mm_sub_ps(iz1,jz3);
1580             dx21             = _mm_sub_ps(ix2,jx1);
1581             dy21             = _mm_sub_ps(iy2,jy1);
1582             dz21             = _mm_sub_ps(iz2,jz1);
1583             dx22             = _mm_sub_ps(ix2,jx2);
1584             dy22             = _mm_sub_ps(iy2,jy2);
1585             dz22             = _mm_sub_ps(iz2,jz2);
1586             dx23             = _mm_sub_ps(ix2,jx3);
1587             dy23             = _mm_sub_ps(iy2,jy3);
1588             dz23             = _mm_sub_ps(iz2,jz3);
1589             dx31             = _mm_sub_ps(ix3,jx1);
1590             dy31             = _mm_sub_ps(iy3,jy1);
1591             dz31             = _mm_sub_ps(iz3,jz1);
1592             dx32             = _mm_sub_ps(ix3,jx2);
1593             dy32             = _mm_sub_ps(iy3,jy2);
1594             dz32             = _mm_sub_ps(iz3,jz2);
1595             dx33             = _mm_sub_ps(ix3,jx3);
1596             dy33             = _mm_sub_ps(iy3,jy3);
1597             dz33             = _mm_sub_ps(iz3,jz3);
1598
1599             /* Calculate squared distance and things based on it */
1600             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1601             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1602             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1603             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1604             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1605             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1606             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1607             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1608             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1609             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1610
1611             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1612             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1613             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1614             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1615             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1616             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1617             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1618             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1619             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1620             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1621
1622             fjx0             = _mm_setzero_ps();
1623             fjy0             = _mm_setzero_ps();
1624             fjz0             = _mm_setzero_ps();
1625             fjx1             = _mm_setzero_ps();
1626             fjy1             = _mm_setzero_ps();
1627             fjz1             = _mm_setzero_ps();
1628             fjx2             = _mm_setzero_ps();
1629             fjy2             = _mm_setzero_ps();
1630             fjz2             = _mm_setzero_ps();
1631             fjx3             = _mm_setzero_ps();
1632             fjy3             = _mm_setzero_ps();
1633             fjz3             = _mm_setzero_ps();
1634
1635             /**************************
1636              * CALCULATE INTERACTIONS *
1637              **************************/
1638
1639             r00              = _mm_mul_ps(rsq00,rinv00);
1640
1641             /* Calculate table index by multiplying r with table scale and truncate to integer */
1642             rt               = _mm_mul_ps(r00,vftabscale);
1643             vfitab           = _mm_cvttps_epi32(rt);
1644 #ifdef __XOP__
1645             vfeps            = _mm_frcz_ps(rt);
1646 #else
1647             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1648 #endif
1649             twovfeps         = _mm_add_ps(vfeps,vfeps);
1650             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1651
1652             /* CUBIC SPLINE TABLE DISPERSION */
1653             vfitab           = _mm_add_epi32(vfitab,ifour);
1654             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1655             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1656             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1657             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1658             _MM_TRANSPOSE4_PS(Y,F,G,H);
1659             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1660             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1661             fvdw6            = _mm_mul_ps(c6_00,FF);
1662
1663             /* CUBIC SPLINE TABLE REPULSION */
1664             vfitab           = _mm_add_epi32(vfitab,ifour);
1665             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1666             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1667             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1668             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1669             _MM_TRANSPOSE4_PS(Y,F,G,H);
1670             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1671             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1672             fvdw12           = _mm_mul_ps(c12_00,FF);
1673             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1674
1675             fscal            = fvdw;
1676
1677              /* Update vectorial force */
1678             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1679             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1680             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1681
1682             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1683             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1684             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1685
1686             /**************************
1687              * CALCULATE INTERACTIONS *
1688              **************************/
1689
1690             r11              = _mm_mul_ps(rsq11,rinv11);
1691
1692             /* Calculate table index by multiplying r with table scale and truncate to integer */
1693             rt               = _mm_mul_ps(r11,vftabscale);
1694             vfitab           = _mm_cvttps_epi32(rt);
1695 #ifdef __XOP__
1696             vfeps            = _mm_frcz_ps(rt);
1697 #else
1698             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1699 #endif
1700             twovfeps         = _mm_add_ps(vfeps,vfeps);
1701             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1702
1703             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1704             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1705             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1706             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1707             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1708             _MM_TRANSPOSE4_PS(Y,F,G,H);
1709             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1710             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1711             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1712
1713             fscal            = felec;
1714
1715              /* Update vectorial force */
1716             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1717             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1718             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1719
1720             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1721             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1722             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1723
1724             /**************************
1725              * CALCULATE INTERACTIONS *
1726              **************************/
1727
1728             r12              = _mm_mul_ps(rsq12,rinv12);
1729
1730             /* Calculate table index by multiplying r with table scale and truncate to integer */
1731             rt               = _mm_mul_ps(r12,vftabscale);
1732             vfitab           = _mm_cvttps_epi32(rt);
1733 #ifdef __XOP__
1734             vfeps            = _mm_frcz_ps(rt);
1735 #else
1736             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1737 #endif
1738             twovfeps         = _mm_add_ps(vfeps,vfeps);
1739             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1740
1741             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1742             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1743             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1744             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1745             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1746             _MM_TRANSPOSE4_PS(Y,F,G,H);
1747             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1748             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1749             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1750
1751             fscal            = felec;
1752
1753              /* Update vectorial force */
1754             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1755             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1756             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1757
1758             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1759             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1760             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1761
1762             /**************************
1763              * CALCULATE INTERACTIONS *
1764              **************************/
1765
1766             r13              = _mm_mul_ps(rsq13,rinv13);
1767
1768             /* Calculate table index by multiplying r with table scale and truncate to integer */
1769             rt               = _mm_mul_ps(r13,vftabscale);
1770             vfitab           = _mm_cvttps_epi32(rt);
1771 #ifdef __XOP__
1772             vfeps            = _mm_frcz_ps(rt);
1773 #else
1774             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1775 #endif
1776             twovfeps         = _mm_add_ps(vfeps,vfeps);
1777             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1778
1779             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1780             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1781             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1782             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1783             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1784             _MM_TRANSPOSE4_PS(Y,F,G,H);
1785             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1786             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1787             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1788
1789             fscal            = felec;
1790
1791              /* Update vectorial force */
1792             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1793             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1794             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1795
1796             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1797             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1798             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1799
1800             /**************************
1801              * CALCULATE INTERACTIONS *
1802              **************************/
1803
1804             r21              = _mm_mul_ps(rsq21,rinv21);
1805
1806             /* Calculate table index by multiplying r with table scale and truncate to integer */
1807             rt               = _mm_mul_ps(r21,vftabscale);
1808             vfitab           = _mm_cvttps_epi32(rt);
1809 #ifdef __XOP__
1810             vfeps            = _mm_frcz_ps(rt);
1811 #else
1812             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1813 #endif
1814             twovfeps         = _mm_add_ps(vfeps,vfeps);
1815             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1816
1817             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1818             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1819             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1820             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1821             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1822             _MM_TRANSPOSE4_PS(Y,F,G,H);
1823             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1824             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1825             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1826
1827             fscal            = felec;
1828
1829              /* Update vectorial force */
1830             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1831             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1832             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1833
1834             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1835             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1836             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1837
1838             /**************************
1839              * CALCULATE INTERACTIONS *
1840              **************************/
1841
1842             r22              = _mm_mul_ps(rsq22,rinv22);
1843
1844             /* Calculate table index by multiplying r with table scale and truncate to integer */
1845             rt               = _mm_mul_ps(r22,vftabscale);
1846             vfitab           = _mm_cvttps_epi32(rt);
1847 #ifdef __XOP__
1848             vfeps            = _mm_frcz_ps(rt);
1849 #else
1850             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1851 #endif
1852             twovfeps         = _mm_add_ps(vfeps,vfeps);
1853             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1854
1855             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1856             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1857             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1858             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1859             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1860             _MM_TRANSPOSE4_PS(Y,F,G,H);
1861             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1862             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1863             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1864
1865             fscal            = felec;
1866
1867              /* Update vectorial force */
1868             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1869             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1870             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1871
1872             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1873             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1874             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1875
1876             /**************************
1877              * CALCULATE INTERACTIONS *
1878              **************************/
1879
1880             r23              = _mm_mul_ps(rsq23,rinv23);
1881
1882             /* Calculate table index by multiplying r with table scale and truncate to integer */
1883             rt               = _mm_mul_ps(r23,vftabscale);
1884             vfitab           = _mm_cvttps_epi32(rt);
1885 #ifdef __XOP__
1886             vfeps            = _mm_frcz_ps(rt);
1887 #else
1888             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1889 #endif
1890             twovfeps         = _mm_add_ps(vfeps,vfeps);
1891             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1892
1893             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1894             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1895             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1896             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1897             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1898             _MM_TRANSPOSE4_PS(Y,F,G,H);
1899             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1900             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1901             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1902
1903             fscal            = felec;
1904
1905              /* Update vectorial force */
1906             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1907             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1908             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1909
1910             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1911             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1912             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1913
1914             /**************************
1915              * CALCULATE INTERACTIONS *
1916              **************************/
1917
1918             r31              = _mm_mul_ps(rsq31,rinv31);
1919
1920             /* Calculate table index by multiplying r with table scale and truncate to integer */
1921             rt               = _mm_mul_ps(r31,vftabscale);
1922             vfitab           = _mm_cvttps_epi32(rt);
1923 #ifdef __XOP__
1924             vfeps            = _mm_frcz_ps(rt);
1925 #else
1926             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1927 #endif
1928             twovfeps         = _mm_add_ps(vfeps,vfeps);
1929             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1930
1931             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1932             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1933             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1934             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1935             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1936             _MM_TRANSPOSE4_PS(Y,F,G,H);
1937             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1938             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1939             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1940
1941             fscal            = felec;
1942
1943              /* Update vectorial force */
1944             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1945             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1946             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1947
1948             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1949             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1950             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1951
1952             /**************************
1953              * CALCULATE INTERACTIONS *
1954              **************************/
1955
1956             r32              = _mm_mul_ps(rsq32,rinv32);
1957
1958             /* Calculate table index by multiplying r with table scale and truncate to integer */
1959             rt               = _mm_mul_ps(r32,vftabscale);
1960             vfitab           = _mm_cvttps_epi32(rt);
1961 #ifdef __XOP__
1962             vfeps            = _mm_frcz_ps(rt);
1963 #else
1964             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1965 #endif
1966             twovfeps         = _mm_add_ps(vfeps,vfeps);
1967             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1968
1969             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1970             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1971             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1972             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1973             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1974             _MM_TRANSPOSE4_PS(Y,F,G,H);
1975             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1976             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1977             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1978
1979             fscal            = felec;
1980
1981              /* Update vectorial force */
1982             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1983             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1984             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1985
1986             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1987             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1988             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1989
1990             /**************************
1991              * CALCULATE INTERACTIONS *
1992              **************************/
1993
1994             r33              = _mm_mul_ps(rsq33,rinv33);
1995
1996             /* Calculate table index by multiplying r with table scale and truncate to integer */
1997             rt               = _mm_mul_ps(r33,vftabscale);
1998             vfitab           = _mm_cvttps_epi32(rt);
1999 #ifdef __XOP__
2000             vfeps            = _mm_frcz_ps(rt);
2001 #else
2002             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2003 #endif
2004             twovfeps         = _mm_add_ps(vfeps,vfeps);
2005             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2006
2007             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2008             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2009             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2010             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2011             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2012             _MM_TRANSPOSE4_PS(Y,F,G,H);
2013             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2014             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2015             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2016
2017             fscal            = felec;
2018
2019              /* Update vectorial force */
2020             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2021             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2022             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2023
2024             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2025             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2026             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2027
2028             fjptrA             = f+j_coord_offsetA;
2029             fjptrB             = f+j_coord_offsetB;
2030             fjptrC             = f+j_coord_offsetC;
2031             fjptrD             = f+j_coord_offsetD;
2032
2033             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2034                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2035                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2036
2037             /* Inner loop uses 432 flops */
2038         }
2039
2040         if(jidx<j_index_end)
2041         {
2042
2043             /* Get j neighbor index, and coordinate index */
2044             jnrlistA         = jjnr[jidx];
2045             jnrlistB         = jjnr[jidx+1];
2046             jnrlistC         = jjnr[jidx+2];
2047             jnrlistD         = jjnr[jidx+3];
2048             /* Sign of each element will be negative for non-real atoms.
2049              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2050              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2051              */
2052             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2053             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2054             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2055             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2056             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2057             j_coord_offsetA  = DIM*jnrA;
2058             j_coord_offsetB  = DIM*jnrB;
2059             j_coord_offsetC  = DIM*jnrC;
2060             j_coord_offsetD  = DIM*jnrD;
2061
2062             /* load j atom coordinates */
2063             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2064                                               x+j_coord_offsetC,x+j_coord_offsetD,
2065                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2066                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2067
2068             /* Calculate displacement vector */
2069             dx00             = _mm_sub_ps(ix0,jx0);
2070             dy00             = _mm_sub_ps(iy0,jy0);
2071             dz00             = _mm_sub_ps(iz0,jz0);
2072             dx11             = _mm_sub_ps(ix1,jx1);
2073             dy11             = _mm_sub_ps(iy1,jy1);
2074             dz11             = _mm_sub_ps(iz1,jz1);
2075             dx12             = _mm_sub_ps(ix1,jx2);
2076             dy12             = _mm_sub_ps(iy1,jy2);
2077             dz12             = _mm_sub_ps(iz1,jz2);
2078             dx13             = _mm_sub_ps(ix1,jx3);
2079             dy13             = _mm_sub_ps(iy1,jy3);
2080             dz13             = _mm_sub_ps(iz1,jz3);
2081             dx21             = _mm_sub_ps(ix2,jx1);
2082             dy21             = _mm_sub_ps(iy2,jy1);
2083             dz21             = _mm_sub_ps(iz2,jz1);
2084             dx22             = _mm_sub_ps(ix2,jx2);
2085             dy22             = _mm_sub_ps(iy2,jy2);
2086             dz22             = _mm_sub_ps(iz2,jz2);
2087             dx23             = _mm_sub_ps(ix2,jx3);
2088             dy23             = _mm_sub_ps(iy2,jy3);
2089             dz23             = _mm_sub_ps(iz2,jz3);
2090             dx31             = _mm_sub_ps(ix3,jx1);
2091             dy31             = _mm_sub_ps(iy3,jy1);
2092             dz31             = _mm_sub_ps(iz3,jz1);
2093             dx32             = _mm_sub_ps(ix3,jx2);
2094             dy32             = _mm_sub_ps(iy3,jy2);
2095             dz32             = _mm_sub_ps(iz3,jz2);
2096             dx33             = _mm_sub_ps(ix3,jx3);
2097             dy33             = _mm_sub_ps(iy3,jy3);
2098             dz33             = _mm_sub_ps(iz3,jz3);
2099
2100             /* Calculate squared distance and things based on it */
2101             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2102             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2103             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2104             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2105             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2106             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2107             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2108             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2109             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2110             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2111
2112             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2113             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2114             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2115             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2116             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2117             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2118             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2119             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2120             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2121             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2122
2123             fjx0             = _mm_setzero_ps();
2124             fjy0             = _mm_setzero_ps();
2125             fjz0             = _mm_setzero_ps();
2126             fjx1             = _mm_setzero_ps();
2127             fjy1             = _mm_setzero_ps();
2128             fjz1             = _mm_setzero_ps();
2129             fjx2             = _mm_setzero_ps();
2130             fjy2             = _mm_setzero_ps();
2131             fjz2             = _mm_setzero_ps();
2132             fjx3             = _mm_setzero_ps();
2133             fjy3             = _mm_setzero_ps();
2134             fjz3             = _mm_setzero_ps();
2135
2136             /**************************
2137              * CALCULATE INTERACTIONS *
2138              **************************/
2139
2140             r00              = _mm_mul_ps(rsq00,rinv00);
2141             r00              = _mm_andnot_ps(dummy_mask,r00);
2142
2143             /* Calculate table index by multiplying r with table scale and truncate to integer */
2144             rt               = _mm_mul_ps(r00,vftabscale);
2145             vfitab           = _mm_cvttps_epi32(rt);
2146 #ifdef __XOP__
2147             vfeps            = _mm_frcz_ps(rt);
2148 #else
2149             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2150 #endif
2151             twovfeps         = _mm_add_ps(vfeps,vfeps);
2152             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2153
2154             /* CUBIC SPLINE TABLE DISPERSION */
2155             vfitab           = _mm_add_epi32(vfitab,ifour);
2156             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2157             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2158             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2159             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2160             _MM_TRANSPOSE4_PS(Y,F,G,H);
2161             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2162             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2163             fvdw6            = _mm_mul_ps(c6_00,FF);
2164
2165             /* CUBIC SPLINE TABLE REPULSION */
2166             vfitab           = _mm_add_epi32(vfitab,ifour);
2167             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2168             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2169             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2170             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2171             _MM_TRANSPOSE4_PS(Y,F,G,H);
2172             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2173             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2174             fvdw12           = _mm_mul_ps(c12_00,FF);
2175             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2176
2177             fscal            = fvdw;
2178
2179             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2180
2181              /* Update vectorial force */
2182             fix0             = _mm_macc_ps(dx00,fscal,fix0);
2183             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
2184             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
2185
2186             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
2187             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
2188             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
2189
2190             /**************************
2191              * CALCULATE INTERACTIONS *
2192              **************************/
2193
2194             r11              = _mm_mul_ps(rsq11,rinv11);
2195             r11              = _mm_andnot_ps(dummy_mask,r11);
2196
2197             /* Calculate table index by multiplying r with table scale and truncate to integer */
2198             rt               = _mm_mul_ps(r11,vftabscale);
2199             vfitab           = _mm_cvttps_epi32(rt);
2200 #ifdef __XOP__
2201             vfeps            = _mm_frcz_ps(rt);
2202 #else
2203             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2204 #endif
2205             twovfeps         = _mm_add_ps(vfeps,vfeps);
2206             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2207
2208             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2209             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2210             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2211             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2212             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2213             _MM_TRANSPOSE4_PS(Y,F,G,H);
2214             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2215             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2216             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2217
2218             fscal            = felec;
2219
2220             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2221
2222              /* Update vectorial force */
2223             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2224             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2225             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2226
2227             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2228             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2229             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2230
2231             /**************************
2232              * CALCULATE INTERACTIONS *
2233              **************************/
2234
2235             r12              = _mm_mul_ps(rsq12,rinv12);
2236             r12              = _mm_andnot_ps(dummy_mask,r12);
2237
2238             /* Calculate table index by multiplying r with table scale and truncate to integer */
2239             rt               = _mm_mul_ps(r12,vftabscale);
2240             vfitab           = _mm_cvttps_epi32(rt);
2241 #ifdef __XOP__
2242             vfeps            = _mm_frcz_ps(rt);
2243 #else
2244             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2245 #endif
2246             twovfeps         = _mm_add_ps(vfeps,vfeps);
2247             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2248
2249             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2250             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2251             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2252             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2253             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2254             _MM_TRANSPOSE4_PS(Y,F,G,H);
2255             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2256             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2257             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2258
2259             fscal            = felec;
2260
2261             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2262
2263              /* Update vectorial force */
2264             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2265             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2266             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2267
2268             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2269             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2270             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2271
2272             /**************************
2273              * CALCULATE INTERACTIONS *
2274              **************************/
2275
2276             r13              = _mm_mul_ps(rsq13,rinv13);
2277             r13              = _mm_andnot_ps(dummy_mask,r13);
2278
2279             /* Calculate table index by multiplying r with table scale and truncate to integer */
2280             rt               = _mm_mul_ps(r13,vftabscale);
2281             vfitab           = _mm_cvttps_epi32(rt);
2282 #ifdef __XOP__
2283             vfeps            = _mm_frcz_ps(rt);
2284 #else
2285             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2286 #endif
2287             twovfeps         = _mm_add_ps(vfeps,vfeps);
2288             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2289
2290             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2291             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2292             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2293             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2294             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2295             _MM_TRANSPOSE4_PS(Y,F,G,H);
2296             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2297             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2298             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2299
2300             fscal            = felec;
2301
2302             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2303
2304              /* Update vectorial force */
2305             fix1             = _mm_macc_ps(dx13,fscal,fix1);
2306             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
2307             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
2308
2309             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
2310             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
2311             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
2312
2313             /**************************
2314              * CALCULATE INTERACTIONS *
2315              **************************/
2316
2317             r21              = _mm_mul_ps(rsq21,rinv21);
2318             r21              = _mm_andnot_ps(dummy_mask,r21);
2319
2320             /* Calculate table index by multiplying r with table scale and truncate to integer */
2321             rt               = _mm_mul_ps(r21,vftabscale);
2322             vfitab           = _mm_cvttps_epi32(rt);
2323 #ifdef __XOP__
2324             vfeps            = _mm_frcz_ps(rt);
2325 #else
2326             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2327 #endif
2328             twovfeps         = _mm_add_ps(vfeps,vfeps);
2329             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2330
2331             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2332             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2333             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2334             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2335             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2336             _MM_TRANSPOSE4_PS(Y,F,G,H);
2337             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2338             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2339             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2340
2341             fscal            = felec;
2342
2343             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2344
2345              /* Update vectorial force */
2346             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2347             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2348             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2349
2350             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2351             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2352             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2353
2354             /**************************
2355              * CALCULATE INTERACTIONS *
2356              **************************/
2357
2358             r22              = _mm_mul_ps(rsq22,rinv22);
2359             r22              = _mm_andnot_ps(dummy_mask,r22);
2360
2361             /* Calculate table index by multiplying r with table scale and truncate to integer */
2362             rt               = _mm_mul_ps(r22,vftabscale);
2363             vfitab           = _mm_cvttps_epi32(rt);
2364 #ifdef __XOP__
2365             vfeps            = _mm_frcz_ps(rt);
2366 #else
2367             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2368 #endif
2369             twovfeps         = _mm_add_ps(vfeps,vfeps);
2370             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2371
2372             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2373             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2374             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2375             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2376             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2377             _MM_TRANSPOSE4_PS(Y,F,G,H);
2378             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2379             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2380             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2381
2382             fscal            = felec;
2383
2384             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2385
2386              /* Update vectorial force */
2387             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2388             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2389             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2390
2391             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2392             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2393             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2394
2395             /**************************
2396              * CALCULATE INTERACTIONS *
2397              **************************/
2398
2399             r23              = _mm_mul_ps(rsq23,rinv23);
2400             r23              = _mm_andnot_ps(dummy_mask,r23);
2401
2402             /* Calculate table index by multiplying r with table scale and truncate to integer */
2403             rt               = _mm_mul_ps(r23,vftabscale);
2404             vfitab           = _mm_cvttps_epi32(rt);
2405 #ifdef __XOP__
2406             vfeps            = _mm_frcz_ps(rt);
2407 #else
2408             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2409 #endif
2410             twovfeps         = _mm_add_ps(vfeps,vfeps);
2411             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2412
2413             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2414             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2415             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2416             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2417             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2418             _MM_TRANSPOSE4_PS(Y,F,G,H);
2419             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2420             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2421             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2422
2423             fscal            = felec;
2424
2425             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2426
2427              /* Update vectorial force */
2428             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2429             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2430             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2431
2432             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2433             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2434             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2435
2436             /**************************
2437              * CALCULATE INTERACTIONS *
2438              **************************/
2439
2440             r31              = _mm_mul_ps(rsq31,rinv31);
2441             r31              = _mm_andnot_ps(dummy_mask,r31);
2442
2443             /* Calculate table index by multiplying r with table scale and truncate to integer */
2444             rt               = _mm_mul_ps(r31,vftabscale);
2445             vfitab           = _mm_cvttps_epi32(rt);
2446 #ifdef __XOP__
2447             vfeps            = _mm_frcz_ps(rt);
2448 #else
2449             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2450 #endif
2451             twovfeps         = _mm_add_ps(vfeps,vfeps);
2452             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2453
2454             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2455             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2456             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2457             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2458             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2459             _MM_TRANSPOSE4_PS(Y,F,G,H);
2460             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2461             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2462             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2463
2464             fscal            = felec;
2465
2466             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2467
2468              /* Update vectorial force */
2469             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2470             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2471             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2472
2473             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2474             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2475             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2476
2477             /**************************
2478              * CALCULATE INTERACTIONS *
2479              **************************/
2480
2481             r32              = _mm_mul_ps(rsq32,rinv32);
2482             r32              = _mm_andnot_ps(dummy_mask,r32);
2483
2484             /* Calculate table index by multiplying r with table scale and truncate to integer */
2485             rt               = _mm_mul_ps(r32,vftabscale);
2486             vfitab           = _mm_cvttps_epi32(rt);
2487 #ifdef __XOP__
2488             vfeps            = _mm_frcz_ps(rt);
2489 #else
2490             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2491 #endif
2492             twovfeps         = _mm_add_ps(vfeps,vfeps);
2493             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2494
2495             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2496             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2497             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2498             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2499             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2500             _MM_TRANSPOSE4_PS(Y,F,G,H);
2501             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2502             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2503             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2504
2505             fscal            = felec;
2506
2507             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2508
2509              /* Update vectorial force */
2510             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2511             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2512             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2513
2514             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2515             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2516             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2517
2518             /**************************
2519              * CALCULATE INTERACTIONS *
2520              **************************/
2521
2522             r33              = _mm_mul_ps(rsq33,rinv33);
2523             r33              = _mm_andnot_ps(dummy_mask,r33);
2524
2525             /* Calculate table index by multiplying r with table scale and truncate to integer */
2526             rt               = _mm_mul_ps(r33,vftabscale);
2527             vfitab           = _mm_cvttps_epi32(rt);
2528 #ifdef __XOP__
2529             vfeps            = _mm_frcz_ps(rt);
2530 #else
2531             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2532 #endif
2533             twovfeps         = _mm_add_ps(vfeps,vfeps);
2534             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2535
2536             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2537             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2538             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2539             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2540             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2541             _MM_TRANSPOSE4_PS(Y,F,G,H);
2542             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2543             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2544             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2545
2546             fscal            = felec;
2547
2548             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2549
2550              /* Update vectorial force */
2551             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2552             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2553             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2554
2555             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2556             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2557             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2558
2559             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2560             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2561             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2562             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2563
2564             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2565                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2566                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2567
2568             /* Inner loop uses 442 flops */
2569         }
2570
2571         /* End of innermost loop */
2572
2573         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2574                                               f+i_coord_offset,fshift+i_shift_offset);
2575
2576         /* Increment number of inner iterations */
2577         inneriter                  += j_index_end - j_index_start;
2578
2579         /* Outer loop uses 24 flops */
2580     }
2581
2582     /* Increment number of outer iterations */
2583     outeriter        += nri;
2584
2585     /* Update outer/inner flops */
2586
2587     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*442);
2588 }