7d62483b5966aecb6211b181df6ed68705429121
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gmx_math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_single
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to the j-loop unrolling done with AVX_128, i.e. the four different
72      * jnr indices whose data are placed in the four positions of the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
113     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
114     __m128i          vfitab;
115     __m128i          ifour       = _mm_set1_epi32(4);
116     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
117     real             *vftab;
118     __m128           dummy_mask,cutoff_mask;
119     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
120     __m128           one     = _mm_set1_ps(1.0);
121     __m128           two     = _mm_set1_ps(2.0);
122     x                = xx[0];
123     f                = ff[0];
124
125     nri              = nlist->nri;
126     iinr             = nlist->iinr;
127     jindex           = nlist->jindex;
128     jjnr             = nlist->jjnr;
129     shiftidx         = nlist->shift;
130     gid              = nlist->gid;
131     shiftvec         = fr->shift_vec[0];
132     fshift           = fr->fshift[0];
133     facel            = _mm_set1_ps(fr->epsfac);
134     charge           = mdatoms->chargeA;
135     nvdwtype         = fr->ntype;
136     vdwparam         = fr->nbfp;
137     vdwtype          = mdatoms->typeA;
138
139     vftab            = kernel_data->table_elec->data;
140     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
141
142     /* Setup water-specific parameters */
143     inr              = nlist->iinr[0];
144     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
145     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
148
149     jq0              = _mm_set1_ps(charge[inr+0]);
150     jq1              = _mm_set1_ps(charge[inr+1]);
151     jq2              = _mm_set1_ps(charge[inr+2]);
152     vdwjidx0A        = 2*vdwtype[inr+0];
153     qq00             = _mm_mul_ps(iq0,jq0);
154     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
155     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
156     qq01             = _mm_mul_ps(iq0,jq1);
157     qq02             = _mm_mul_ps(iq0,jq2);
158     qq10             = _mm_mul_ps(iq1,jq0);
159     qq11             = _mm_mul_ps(iq1,jq1);
160     qq12             = _mm_mul_ps(iq1,jq2);
161     qq20             = _mm_mul_ps(iq2,jq0);
162     qq21             = _mm_mul_ps(iq2,jq1);
163     qq22             = _mm_mul_ps(iq2,jq2);
164
165     /* Avoid stupid compiler warnings */
166     jnrA = jnrB = jnrC = jnrD = 0;
167     j_coord_offsetA = 0;
168     j_coord_offsetB = 0;
169     j_coord_offsetC = 0;
170     j_coord_offsetD = 0;
171
172     outeriter        = 0;
173     inneriter        = 0;
174
175     for(iidx=0;iidx<4*DIM;iidx++)
176     {
177         scratch[iidx] = 0.0;
178     }
179
180     /* Start outer loop over neighborlists */
181     for(iidx=0; iidx<nri; iidx++)
182     {
183         /* Load shift vector for this list */
184         i_shift_offset   = DIM*shiftidx[iidx];
185
186         /* Load limits for loop over neighbors */
187         j_index_start    = jindex[iidx];
188         j_index_end      = jindex[iidx+1];
189
190         /* Get outer coordinate index */
191         inr              = iinr[iidx];
192         i_coord_offset   = DIM*inr;
193
194         /* Load i particle coords and add shift vector */
195         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
196                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
197
198         fix0             = _mm_setzero_ps();
199         fiy0             = _mm_setzero_ps();
200         fiz0             = _mm_setzero_ps();
201         fix1             = _mm_setzero_ps();
202         fiy1             = _mm_setzero_ps();
203         fiz1             = _mm_setzero_ps();
204         fix2             = _mm_setzero_ps();
205         fiy2             = _mm_setzero_ps();
206         fiz2             = _mm_setzero_ps();
207
208         /* Reset potential sums */
209         velecsum         = _mm_setzero_ps();
210         vvdwsum          = _mm_setzero_ps();
211
212         /* Start inner kernel loop */
213         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
214         {
215
216             /* Get j neighbor index, and coordinate index */
217             jnrA             = jjnr[jidx];
218             jnrB             = jjnr[jidx+1];
219             jnrC             = jjnr[jidx+2];
220             jnrD             = jjnr[jidx+3];
221             j_coord_offsetA  = DIM*jnrA;
222             j_coord_offsetB  = DIM*jnrB;
223             j_coord_offsetC  = DIM*jnrC;
224             j_coord_offsetD  = DIM*jnrD;
225
226             /* load j atom coordinates */
227             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228                                               x+j_coord_offsetC,x+j_coord_offsetD,
229                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
230
231             /* Calculate displacement vector */
232             dx00             = _mm_sub_ps(ix0,jx0);
233             dy00             = _mm_sub_ps(iy0,jy0);
234             dz00             = _mm_sub_ps(iz0,jz0);
235             dx01             = _mm_sub_ps(ix0,jx1);
236             dy01             = _mm_sub_ps(iy0,jy1);
237             dz01             = _mm_sub_ps(iz0,jz1);
238             dx02             = _mm_sub_ps(ix0,jx2);
239             dy02             = _mm_sub_ps(iy0,jy2);
240             dz02             = _mm_sub_ps(iz0,jz2);
241             dx10             = _mm_sub_ps(ix1,jx0);
242             dy10             = _mm_sub_ps(iy1,jy0);
243             dz10             = _mm_sub_ps(iz1,jz0);
244             dx11             = _mm_sub_ps(ix1,jx1);
245             dy11             = _mm_sub_ps(iy1,jy1);
246             dz11             = _mm_sub_ps(iz1,jz1);
247             dx12             = _mm_sub_ps(ix1,jx2);
248             dy12             = _mm_sub_ps(iy1,jy2);
249             dz12             = _mm_sub_ps(iz1,jz2);
250             dx20             = _mm_sub_ps(ix2,jx0);
251             dy20             = _mm_sub_ps(iy2,jy0);
252             dz20             = _mm_sub_ps(iz2,jz0);
253             dx21             = _mm_sub_ps(ix2,jx1);
254             dy21             = _mm_sub_ps(iy2,jy1);
255             dz21             = _mm_sub_ps(iz2,jz1);
256             dx22             = _mm_sub_ps(ix2,jx2);
257             dy22             = _mm_sub_ps(iy2,jy2);
258             dz22             = _mm_sub_ps(iz2,jz2);
259
260             /* Calculate squared distance and things based on it */
261             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
262             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
263             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
264             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
265             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
266             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
267             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
268             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
269             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
270
271             rinv00           = gmx_mm_invsqrt_ps(rsq00);
272             rinv01           = gmx_mm_invsqrt_ps(rsq01);
273             rinv02           = gmx_mm_invsqrt_ps(rsq02);
274             rinv10           = gmx_mm_invsqrt_ps(rsq10);
275             rinv11           = gmx_mm_invsqrt_ps(rsq11);
276             rinv12           = gmx_mm_invsqrt_ps(rsq12);
277             rinv20           = gmx_mm_invsqrt_ps(rsq20);
278             rinv21           = gmx_mm_invsqrt_ps(rsq21);
279             rinv22           = gmx_mm_invsqrt_ps(rsq22);
280
281             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
282
283             fjx0             = _mm_setzero_ps();
284             fjy0             = _mm_setzero_ps();
285             fjz0             = _mm_setzero_ps();
286             fjx1             = _mm_setzero_ps();
287             fjy1             = _mm_setzero_ps();
288             fjz1             = _mm_setzero_ps();
289             fjx2             = _mm_setzero_ps();
290             fjy2             = _mm_setzero_ps();
291             fjz2             = _mm_setzero_ps();
292
293             /**************************
294              * CALCULATE INTERACTIONS *
295              **************************/
296
297             r00              = _mm_mul_ps(rsq00,rinv00);
298
299             /* Calculate table index by multiplying r with table scale and truncate to integer */
300             rt               = _mm_mul_ps(r00,vftabscale);
301             vfitab           = _mm_cvttps_epi32(rt);
302 #ifdef __XOP__
303             vfeps            = _mm_frcz_ps(rt);
304 #else
305             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
306 #endif
307             twovfeps         = _mm_add_ps(vfeps,vfeps);
308             vfitab           = _mm_slli_epi32(vfitab,2);
309
310             /* CUBIC SPLINE TABLE ELECTROSTATICS */
311             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
312             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
313             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
314             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
315             _MM_TRANSPOSE4_PS(Y,F,G,H);
316             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
317             VV               = _mm_macc_ps(vfeps,Fp,Y);
318             velec            = _mm_mul_ps(qq00,VV);
319             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
320             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
321
322             /* LENNARD-JONES DISPERSION/REPULSION */
323
324             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
325             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
326             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
327             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
328             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
329
330             /* Update potential sum for this i atom from the interaction with this j atom. */
331             velecsum         = _mm_add_ps(velecsum,velec);
332             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
333
334             fscal            = _mm_add_ps(felec,fvdw);
335
336              /* Update vectorial force */
337             fix0             = _mm_macc_ps(dx00,fscal,fix0);
338             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
339             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
340
341             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
342             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
343             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
344
345             /**************************
346              * CALCULATE INTERACTIONS *
347              **************************/
348
349             r01              = _mm_mul_ps(rsq01,rinv01);
350
351             /* Calculate table index by multiplying r with table scale and truncate to integer */
352             rt               = _mm_mul_ps(r01,vftabscale);
353             vfitab           = _mm_cvttps_epi32(rt);
354 #ifdef __XOP__
355             vfeps            = _mm_frcz_ps(rt);
356 #else
357             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
358 #endif
359             twovfeps         = _mm_add_ps(vfeps,vfeps);
360             vfitab           = _mm_slli_epi32(vfitab,2);
361
362             /* CUBIC SPLINE TABLE ELECTROSTATICS */
363             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
364             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
365             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
366             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
367             _MM_TRANSPOSE4_PS(Y,F,G,H);
368             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
369             VV               = _mm_macc_ps(vfeps,Fp,Y);
370             velec            = _mm_mul_ps(qq01,VV);
371             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
372             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
373
374             /* Update potential sum for this i atom from the interaction with this j atom. */
375             velecsum         = _mm_add_ps(velecsum,velec);
376
377             fscal            = felec;
378
379              /* Update vectorial force */
380             fix0             = _mm_macc_ps(dx01,fscal,fix0);
381             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
382             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
383
384             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
385             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
386             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
387
388             /**************************
389              * CALCULATE INTERACTIONS *
390              **************************/
391
392             r02              = _mm_mul_ps(rsq02,rinv02);
393
394             /* Calculate table index by multiplying r with table scale and truncate to integer */
395             rt               = _mm_mul_ps(r02,vftabscale);
396             vfitab           = _mm_cvttps_epi32(rt);
397 #ifdef __XOP__
398             vfeps            = _mm_frcz_ps(rt);
399 #else
400             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
401 #endif
402             twovfeps         = _mm_add_ps(vfeps,vfeps);
403             vfitab           = _mm_slli_epi32(vfitab,2);
404
405             /* CUBIC SPLINE TABLE ELECTROSTATICS */
406             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
407             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
408             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
409             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
410             _MM_TRANSPOSE4_PS(Y,F,G,H);
411             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
412             VV               = _mm_macc_ps(vfeps,Fp,Y);
413             velec            = _mm_mul_ps(qq02,VV);
414             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
415             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
416
417             /* Update potential sum for this i atom from the interaction with this j atom. */
418             velecsum         = _mm_add_ps(velecsum,velec);
419
420             fscal            = felec;
421
422              /* Update vectorial force */
423             fix0             = _mm_macc_ps(dx02,fscal,fix0);
424             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
425             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
426
427             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
428             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
429             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
430
431             /**************************
432              * CALCULATE INTERACTIONS *
433              **************************/
434
435             r10              = _mm_mul_ps(rsq10,rinv10);
436
437             /* Calculate table index by multiplying r with table scale and truncate to integer */
438             rt               = _mm_mul_ps(r10,vftabscale);
439             vfitab           = _mm_cvttps_epi32(rt);
440 #ifdef __XOP__
441             vfeps            = _mm_frcz_ps(rt);
442 #else
443             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
444 #endif
445             twovfeps         = _mm_add_ps(vfeps,vfeps);
446             vfitab           = _mm_slli_epi32(vfitab,2);
447
448             /* CUBIC SPLINE TABLE ELECTROSTATICS */
449             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
450             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
451             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
452             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
453             _MM_TRANSPOSE4_PS(Y,F,G,H);
454             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
455             VV               = _mm_macc_ps(vfeps,Fp,Y);
456             velec            = _mm_mul_ps(qq10,VV);
457             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
458             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
459
460             /* Update potential sum for this i atom from the interaction with this j atom. */
461             velecsum         = _mm_add_ps(velecsum,velec);
462
463             fscal            = felec;
464
465              /* Update vectorial force */
466             fix1             = _mm_macc_ps(dx10,fscal,fix1);
467             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
468             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
469
470             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
471             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
472             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
473
474             /**************************
475              * CALCULATE INTERACTIONS *
476              **************************/
477
478             r11              = _mm_mul_ps(rsq11,rinv11);
479
480             /* Calculate table index by multiplying r with table scale and truncate to integer */
481             rt               = _mm_mul_ps(r11,vftabscale);
482             vfitab           = _mm_cvttps_epi32(rt);
483 #ifdef __XOP__
484             vfeps            = _mm_frcz_ps(rt);
485 #else
486             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
487 #endif
488             twovfeps         = _mm_add_ps(vfeps,vfeps);
489             vfitab           = _mm_slli_epi32(vfitab,2);
490
491             /* CUBIC SPLINE TABLE ELECTROSTATICS */
492             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
493             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
494             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
495             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
496             _MM_TRANSPOSE4_PS(Y,F,G,H);
497             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
498             VV               = _mm_macc_ps(vfeps,Fp,Y);
499             velec            = _mm_mul_ps(qq11,VV);
500             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
501             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
502
503             /* Update potential sum for this i atom from the interaction with this j atom. */
504             velecsum         = _mm_add_ps(velecsum,velec);
505
506             fscal            = felec;
507
508              /* Update vectorial force */
509             fix1             = _mm_macc_ps(dx11,fscal,fix1);
510             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
511             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
512
513             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
514             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
515             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
516
517             /**************************
518              * CALCULATE INTERACTIONS *
519              **************************/
520
521             r12              = _mm_mul_ps(rsq12,rinv12);
522
523             /* Calculate table index by multiplying r with table scale and truncate to integer */
524             rt               = _mm_mul_ps(r12,vftabscale);
525             vfitab           = _mm_cvttps_epi32(rt);
526 #ifdef __XOP__
527             vfeps            = _mm_frcz_ps(rt);
528 #else
529             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
530 #endif
531             twovfeps         = _mm_add_ps(vfeps,vfeps);
532             vfitab           = _mm_slli_epi32(vfitab,2);
533
534             /* CUBIC SPLINE TABLE ELECTROSTATICS */
535             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
536             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
537             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
538             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
539             _MM_TRANSPOSE4_PS(Y,F,G,H);
540             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
541             VV               = _mm_macc_ps(vfeps,Fp,Y);
542             velec            = _mm_mul_ps(qq12,VV);
543             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
544             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
545
546             /* Update potential sum for this i atom from the interaction with this j atom. */
547             velecsum         = _mm_add_ps(velecsum,velec);
548
549             fscal            = felec;
550
551              /* Update vectorial force */
552             fix1             = _mm_macc_ps(dx12,fscal,fix1);
553             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
554             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
555
556             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
557             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
558             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
559
560             /**************************
561              * CALCULATE INTERACTIONS *
562              **************************/
563
564             r20              = _mm_mul_ps(rsq20,rinv20);
565
566             /* Calculate table index by multiplying r with table scale and truncate to integer */
567             rt               = _mm_mul_ps(r20,vftabscale);
568             vfitab           = _mm_cvttps_epi32(rt);
569 #ifdef __XOP__
570             vfeps            = _mm_frcz_ps(rt);
571 #else
572             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
573 #endif
574             twovfeps         = _mm_add_ps(vfeps,vfeps);
575             vfitab           = _mm_slli_epi32(vfitab,2);
576
577             /* CUBIC SPLINE TABLE ELECTROSTATICS */
578             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
579             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
580             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
581             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
582             _MM_TRANSPOSE4_PS(Y,F,G,H);
583             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
584             VV               = _mm_macc_ps(vfeps,Fp,Y);
585             velec            = _mm_mul_ps(qq20,VV);
586             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
587             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
588
589             /* Update potential sum for this i atom from the interaction with this j atom. */
590             velecsum         = _mm_add_ps(velecsum,velec);
591
592             fscal            = felec;
593
594              /* Update vectorial force */
595             fix2             = _mm_macc_ps(dx20,fscal,fix2);
596             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
597             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
598
599             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
600             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
601             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
602
603             /**************************
604              * CALCULATE INTERACTIONS *
605              **************************/
606
607             r21              = _mm_mul_ps(rsq21,rinv21);
608
609             /* Calculate table index by multiplying r with table scale and truncate to integer */
610             rt               = _mm_mul_ps(r21,vftabscale);
611             vfitab           = _mm_cvttps_epi32(rt);
612 #ifdef __XOP__
613             vfeps            = _mm_frcz_ps(rt);
614 #else
615             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
616 #endif
617             twovfeps         = _mm_add_ps(vfeps,vfeps);
618             vfitab           = _mm_slli_epi32(vfitab,2);
619
620             /* CUBIC SPLINE TABLE ELECTROSTATICS */
621             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
622             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
623             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
624             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
625             _MM_TRANSPOSE4_PS(Y,F,G,H);
626             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
627             VV               = _mm_macc_ps(vfeps,Fp,Y);
628             velec            = _mm_mul_ps(qq21,VV);
629             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
630             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
631
632             /* Update potential sum for this i atom from the interaction with this j atom. */
633             velecsum         = _mm_add_ps(velecsum,velec);
634
635             fscal            = felec;
636
637              /* Update vectorial force */
638             fix2             = _mm_macc_ps(dx21,fscal,fix2);
639             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
640             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
641
642             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
643             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
644             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
645
646             /**************************
647              * CALCULATE INTERACTIONS *
648              **************************/
649
650             r22              = _mm_mul_ps(rsq22,rinv22);
651
652             /* Calculate table index by multiplying r with table scale and truncate to integer */
653             rt               = _mm_mul_ps(r22,vftabscale);
654             vfitab           = _mm_cvttps_epi32(rt);
655 #ifdef __XOP__
656             vfeps            = _mm_frcz_ps(rt);
657 #else
658             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
659 #endif
660             twovfeps         = _mm_add_ps(vfeps,vfeps);
661             vfitab           = _mm_slli_epi32(vfitab,2);
662
663             /* CUBIC SPLINE TABLE ELECTROSTATICS */
664             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
665             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
666             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
667             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
668             _MM_TRANSPOSE4_PS(Y,F,G,H);
669             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
670             VV               = _mm_macc_ps(vfeps,Fp,Y);
671             velec            = _mm_mul_ps(qq22,VV);
672             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
673             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
674
675             /* Update potential sum for this i atom from the interaction with this j atom. */
676             velecsum         = _mm_add_ps(velecsum,velec);
677
678             fscal            = felec;
679
680              /* Update vectorial force */
681             fix2             = _mm_macc_ps(dx22,fscal,fix2);
682             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
683             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
684
685             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
686             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
687             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
688
689             fjptrA             = f+j_coord_offsetA;
690             fjptrB             = f+j_coord_offsetB;
691             fjptrC             = f+j_coord_offsetC;
692             fjptrD             = f+j_coord_offsetD;
693
694             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
695                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
696
697             /* Inner loop uses 427 flops */
698         }
699
700         if(jidx<j_index_end)
701         {
702
703             /* Get j neighbor index, and coordinate index */
704             jnrlistA         = jjnr[jidx];
705             jnrlistB         = jjnr[jidx+1];
706             jnrlistC         = jjnr[jidx+2];
707             jnrlistD         = jjnr[jidx+3];
708             /* Sign of each element will be negative for non-real atoms.
709              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
710              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
711              */
712             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
713             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
714             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
715             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
716             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
717             j_coord_offsetA  = DIM*jnrA;
718             j_coord_offsetB  = DIM*jnrB;
719             j_coord_offsetC  = DIM*jnrC;
720             j_coord_offsetD  = DIM*jnrD;
721
722             /* load j atom coordinates */
723             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
724                                               x+j_coord_offsetC,x+j_coord_offsetD,
725                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
726
727             /* Calculate displacement vector */
728             dx00             = _mm_sub_ps(ix0,jx0);
729             dy00             = _mm_sub_ps(iy0,jy0);
730             dz00             = _mm_sub_ps(iz0,jz0);
731             dx01             = _mm_sub_ps(ix0,jx1);
732             dy01             = _mm_sub_ps(iy0,jy1);
733             dz01             = _mm_sub_ps(iz0,jz1);
734             dx02             = _mm_sub_ps(ix0,jx2);
735             dy02             = _mm_sub_ps(iy0,jy2);
736             dz02             = _mm_sub_ps(iz0,jz2);
737             dx10             = _mm_sub_ps(ix1,jx0);
738             dy10             = _mm_sub_ps(iy1,jy0);
739             dz10             = _mm_sub_ps(iz1,jz0);
740             dx11             = _mm_sub_ps(ix1,jx1);
741             dy11             = _mm_sub_ps(iy1,jy1);
742             dz11             = _mm_sub_ps(iz1,jz1);
743             dx12             = _mm_sub_ps(ix1,jx2);
744             dy12             = _mm_sub_ps(iy1,jy2);
745             dz12             = _mm_sub_ps(iz1,jz2);
746             dx20             = _mm_sub_ps(ix2,jx0);
747             dy20             = _mm_sub_ps(iy2,jy0);
748             dz20             = _mm_sub_ps(iz2,jz0);
749             dx21             = _mm_sub_ps(ix2,jx1);
750             dy21             = _mm_sub_ps(iy2,jy1);
751             dz21             = _mm_sub_ps(iz2,jz1);
752             dx22             = _mm_sub_ps(ix2,jx2);
753             dy22             = _mm_sub_ps(iy2,jy2);
754             dz22             = _mm_sub_ps(iz2,jz2);
755
756             /* Calculate squared distance and things based on it */
757             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
758             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
759             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
760             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
761             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
762             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
763             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
764             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
765             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
766
767             rinv00           = gmx_mm_invsqrt_ps(rsq00);
768             rinv01           = gmx_mm_invsqrt_ps(rsq01);
769             rinv02           = gmx_mm_invsqrt_ps(rsq02);
770             rinv10           = gmx_mm_invsqrt_ps(rsq10);
771             rinv11           = gmx_mm_invsqrt_ps(rsq11);
772             rinv12           = gmx_mm_invsqrt_ps(rsq12);
773             rinv20           = gmx_mm_invsqrt_ps(rsq20);
774             rinv21           = gmx_mm_invsqrt_ps(rsq21);
775             rinv22           = gmx_mm_invsqrt_ps(rsq22);
776
777             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
778
779             fjx0             = _mm_setzero_ps();
780             fjy0             = _mm_setzero_ps();
781             fjz0             = _mm_setzero_ps();
782             fjx1             = _mm_setzero_ps();
783             fjy1             = _mm_setzero_ps();
784             fjz1             = _mm_setzero_ps();
785             fjx2             = _mm_setzero_ps();
786             fjy2             = _mm_setzero_ps();
787             fjz2             = _mm_setzero_ps();
788
789             /**************************
790              * CALCULATE INTERACTIONS *
791              **************************/
792
793             r00              = _mm_mul_ps(rsq00,rinv00);
794             r00              = _mm_andnot_ps(dummy_mask,r00);
795
796             /* Calculate table index by multiplying r with table scale and truncate to integer */
797             rt               = _mm_mul_ps(r00,vftabscale);
798             vfitab           = _mm_cvttps_epi32(rt);
799 #ifdef __XOP__
800             vfeps            = _mm_frcz_ps(rt);
801 #else
802             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
803 #endif
804             twovfeps         = _mm_add_ps(vfeps,vfeps);
805             vfitab           = _mm_slli_epi32(vfitab,2);
806
807             /* CUBIC SPLINE TABLE ELECTROSTATICS */
808             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
809             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
810             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
811             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
812             _MM_TRANSPOSE4_PS(Y,F,G,H);
813             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
814             VV               = _mm_macc_ps(vfeps,Fp,Y);
815             velec            = _mm_mul_ps(qq00,VV);
816             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
817             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
818
819             /* LENNARD-JONES DISPERSION/REPULSION */
820
821             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
822             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
823             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
824             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
825             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
826
827             /* Update potential sum for this i atom from the interaction with this j atom. */
828             velec            = _mm_andnot_ps(dummy_mask,velec);
829             velecsum         = _mm_add_ps(velecsum,velec);
830             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
831             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
832
833             fscal            = _mm_add_ps(felec,fvdw);
834
835             fscal            = _mm_andnot_ps(dummy_mask,fscal);
836
837              /* Update vectorial force */
838             fix0             = _mm_macc_ps(dx00,fscal,fix0);
839             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
840             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
841
842             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
843             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
844             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
845
846             /**************************
847              * CALCULATE INTERACTIONS *
848              **************************/
849
850             r01              = _mm_mul_ps(rsq01,rinv01);
851             r01              = _mm_andnot_ps(dummy_mask,r01);
852
853             /* Calculate table index by multiplying r with table scale and truncate to integer */
854             rt               = _mm_mul_ps(r01,vftabscale);
855             vfitab           = _mm_cvttps_epi32(rt);
856 #ifdef __XOP__
857             vfeps            = _mm_frcz_ps(rt);
858 #else
859             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
860 #endif
861             twovfeps         = _mm_add_ps(vfeps,vfeps);
862             vfitab           = _mm_slli_epi32(vfitab,2);
863
864             /* CUBIC SPLINE TABLE ELECTROSTATICS */
865             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
866             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
867             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
868             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
869             _MM_TRANSPOSE4_PS(Y,F,G,H);
870             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
871             VV               = _mm_macc_ps(vfeps,Fp,Y);
872             velec            = _mm_mul_ps(qq01,VV);
873             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
874             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
875
876             /* Update potential sum for this i atom from the interaction with this j atom. */
877             velec            = _mm_andnot_ps(dummy_mask,velec);
878             velecsum         = _mm_add_ps(velecsum,velec);
879
880             fscal            = felec;
881
882             fscal            = _mm_andnot_ps(dummy_mask,fscal);
883
884              /* Update vectorial force */
885             fix0             = _mm_macc_ps(dx01,fscal,fix0);
886             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
887             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
888
889             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
890             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
891             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
892
893             /**************************
894              * CALCULATE INTERACTIONS *
895              **************************/
896
897             r02              = _mm_mul_ps(rsq02,rinv02);
898             r02              = _mm_andnot_ps(dummy_mask,r02);
899
900             /* Calculate table index by multiplying r with table scale and truncate to integer */
901             rt               = _mm_mul_ps(r02,vftabscale);
902             vfitab           = _mm_cvttps_epi32(rt);
903 #ifdef __XOP__
904             vfeps            = _mm_frcz_ps(rt);
905 #else
906             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
907 #endif
908             twovfeps         = _mm_add_ps(vfeps,vfeps);
909             vfitab           = _mm_slli_epi32(vfitab,2);
910
911             /* CUBIC SPLINE TABLE ELECTROSTATICS */
912             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
913             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
914             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
915             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
916             _MM_TRANSPOSE4_PS(Y,F,G,H);
917             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
918             VV               = _mm_macc_ps(vfeps,Fp,Y);
919             velec            = _mm_mul_ps(qq02,VV);
920             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
921             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
922
923             /* Update potential sum for this i atom from the interaction with this j atom. */
924             velec            = _mm_andnot_ps(dummy_mask,velec);
925             velecsum         = _mm_add_ps(velecsum,velec);
926
927             fscal            = felec;
928
929             fscal            = _mm_andnot_ps(dummy_mask,fscal);
930
931              /* Update vectorial force */
932             fix0             = _mm_macc_ps(dx02,fscal,fix0);
933             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
934             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
935
936             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
937             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
938             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
939
940             /**************************
941              * CALCULATE INTERACTIONS *
942              **************************/
943
944             r10              = _mm_mul_ps(rsq10,rinv10);
945             r10              = _mm_andnot_ps(dummy_mask,r10);
946
947             /* Calculate table index by multiplying r with table scale and truncate to integer */
948             rt               = _mm_mul_ps(r10,vftabscale);
949             vfitab           = _mm_cvttps_epi32(rt);
950 #ifdef __XOP__
951             vfeps            = _mm_frcz_ps(rt);
952 #else
953             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
954 #endif
955             twovfeps         = _mm_add_ps(vfeps,vfeps);
956             vfitab           = _mm_slli_epi32(vfitab,2);
957
958             /* CUBIC SPLINE TABLE ELECTROSTATICS */
959             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
960             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
961             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
962             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
963             _MM_TRANSPOSE4_PS(Y,F,G,H);
964             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
965             VV               = _mm_macc_ps(vfeps,Fp,Y);
966             velec            = _mm_mul_ps(qq10,VV);
967             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
968             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
969
970             /* Update potential sum for this i atom from the interaction with this j atom. */
971             velec            = _mm_andnot_ps(dummy_mask,velec);
972             velecsum         = _mm_add_ps(velecsum,velec);
973
974             fscal            = felec;
975
976             fscal            = _mm_andnot_ps(dummy_mask,fscal);
977
978              /* Update vectorial force */
979             fix1             = _mm_macc_ps(dx10,fscal,fix1);
980             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
981             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
982
983             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
984             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
985             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
986
987             /**************************
988              * CALCULATE INTERACTIONS *
989              **************************/
990
991             r11              = _mm_mul_ps(rsq11,rinv11);
992             r11              = _mm_andnot_ps(dummy_mask,r11);
993
994             /* Calculate table index by multiplying r with table scale and truncate to integer */
995             rt               = _mm_mul_ps(r11,vftabscale);
996             vfitab           = _mm_cvttps_epi32(rt);
997 #ifdef __XOP__
998             vfeps            = _mm_frcz_ps(rt);
999 #else
1000             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1001 #endif
1002             twovfeps         = _mm_add_ps(vfeps,vfeps);
1003             vfitab           = _mm_slli_epi32(vfitab,2);
1004
1005             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1006             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1007             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1008             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1009             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1010             _MM_TRANSPOSE4_PS(Y,F,G,H);
1011             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1012             VV               = _mm_macc_ps(vfeps,Fp,Y);
1013             velec            = _mm_mul_ps(qq11,VV);
1014             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1015             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1016
1017             /* Update potential sum for this i atom from the interaction with this j atom. */
1018             velec            = _mm_andnot_ps(dummy_mask,velec);
1019             velecsum         = _mm_add_ps(velecsum,velec);
1020
1021             fscal            = felec;
1022
1023             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1024
1025              /* Update vectorial force */
1026             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1027             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1028             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1029
1030             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1031             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1032             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1033
1034             /**************************
1035              * CALCULATE INTERACTIONS *
1036              **************************/
1037
1038             r12              = _mm_mul_ps(rsq12,rinv12);
1039             r12              = _mm_andnot_ps(dummy_mask,r12);
1040
1041             /* Calculate table index by multiplying r with table scale and truncate to integer */
1042             rt               = _mm_mul_ps(r12,vftabscale);
1043             vfitab           = _mm_cvttps_epi32(rt);
1044 #ifdef __XOP__
1045             vfeps            = _mm_frcz_ps(rt);
1046 #else
1047             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1048 #endif
1049             twovfeps         = _mm_add_ps(vfeps,vfeps);
1050             vfitab           = _mm_slli_epi32(vfitab,2);
1051
1052             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1053             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1054             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1055             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1056             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1057             _MM_TRANSPOSE4_PS(Y,F,G,H);
1058             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1059             VV               = _mm_macc_ps(vfeps,Fp,Y);
1060             velec            = _mm_mul_ps(qq12,VV);
1061             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1062             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1063
1064             /* Update potential sum for this i atom from the interaction with this j atom. */
1065             velec            = _mm_andnot_ps(dummy_mask,velec);
1066             velecsum         = _mm_add_ps(velecsum,velec);
1067
1068             fscal            = felec;
1069
1070             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1071
1072              /* Update vectorial force */
1073             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1074             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1075             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1076
1077             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1078             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1079             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1080
1081             /**************************
1082              * CALCULATE INTERACTIONS *
1083              **************************/
1084
1085             r20              = _mm_mul_ps(rsq20,rinv20);
1086             r20              = _mm_andnot_ps(dummy_mask,r20);
1087
1088             /* Calculate table index by multiplying r with table scale and truncate to integer */
1089             rt               = _mm_mul_ps(r20,vftabscale);
1090             vfitab           = _mm_cvttps_epi32(rt);
1091 #ifdef __XOP__
1092             vfeps            = _mm_frcz_ps(rt);
1093 #else
1094             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1095 #endif
1096             twovfeps         = _mm_add_ps(vfeps,vfeps);
1097             vfitab           = _mm_slli_epi32(vfitab,2);
1098
1099             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1100             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1101             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1102             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1103             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1104             _MM_TRANSPOSE4_PS(Y,F,G,H);
1105             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1106             VV               = _mm_macc_ps(vfeps,Fp,Y);
1107             velec            = _mm_mul_ps(qq20,VV);
1108             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1109             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1110
1111             /* Update potential sum for this i atom from the interaction with this j atom. */
1112             velec            = _mm_andnot_ps(dummy_mask,velec);
1113             velecsum         = _mm_add_ps(velecsum,velec);
1114
1115             fscal            = felec;
1116
1117             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1118
1119              /* Update vectorial force */
1120             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1121             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1122             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1123
1124             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1125             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1126             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1127
1128             /**************************
1129              * CALCULATE INTERACTIONS *
1130              **************************/
1131
1132             r21              = _mm_mul_ps(rsq21,rinv21);
1133             r21              = _mm_andnot_ps(dummy_mask,r21);
1134
1135             /* Calculate table index by multiplying r with table scale and truncate to integer */
1136             rt               = _mm_mul_ps(r21,vftabscale);
1137             vfitab           = _mm_cvttps_epi32(rt);
1138 #ifdef __XOP__
1139             vfeps            = _mm_frcz_ps(rt);
1140 #else
1141             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1142 #endif
1143             twovfeps         = _mm_add_ps(vfeps,vfeps);
1144             vfitab           = _mm_slli_epi32(vfitab,2);
1145
1146             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1147             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1148             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1149             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1150             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1151             _MM_TRANSPOSE4_PS(Y,F,G,H);
1152             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1153             VV               = _mm_macc_ps(vfeps,Fp,Y);
1154             velec            = _mm_mul_ps(qq21,VV);
1155             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1156             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1157
1158             /* Update potential sum for this i atom from the interaction with this j atom. */
1159             velec            = _mm_andnot_ps(dummy_mask,velec);
1160             velecsum         = _mm_add_ps(velecsum,velec);
1161
1162             fscal            = felec;
1163
1164             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1165
1166              /* Update vectorial force */
1167             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1168             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1169             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1170
1171             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1172             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1173             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1174
1175             /**************************
1176              * CALCULATE INTERACTIONS *
1177              **************************/
1178
1179             r22              = _mm_mul_ps(rsq22,rinv22);
1180             r22              = _mm_andnot_ps(dummy_mask,r22);
1181
1182             /* Calculate table index by multiplying r with table scale and truncate to integer */
1183             rt               = _mm_mul_ps(r22,vftabscale);
1184             vfitab           = _mm_cvttps_epi32(rt);
1185 #ifdef __XOP__
1186             vfeps            = _mm_frcz_ps(rt);
1187 #else
1188             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1189 #endif
1190             twovfeps         = _mm_add_ps(vfeps,vfeps);
1191             vfitab           = _mm_slli_epi32(vfitab,2);
1192
1193             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1194             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1195             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1196             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1197             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1198             _MM_TRANSPOSE4_PS(Y,F,G,H);
1199             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1200             VV               = _mm_macc_ps(vfeps,Fp,Y);
1201             velec            = _mm_mul_ps(qq22,VV);
1202             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1203             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1204
1205             /* Update potential sum for this i atom from the interaction with this j atom. */
1206             velec            = _mm_andnot_ps(dummy_mask,velec);
1207             velecsum         = _mm_add_ps(velecsum,velec);
1208
1209             fscal            = felec;
1210
1211             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1212
1213              /* Update vectorial force */
1214             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1215             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1216             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1217
1218             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1219             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1220             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1221
1222             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1223             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1224             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1225             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1226
1227             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1228                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1229
1230             /* Inner loop uses 436 flops */
1231         }
1232
1233         /* End of innermost loop */
1234
1235         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1236                                               f+i_coord_offset,fshift+i_shift_offset);
1237
1238         ggid                        = gid[iidx];
1239         /* Update potential energies */
1240         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1241         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1242
1243         /* Increment number of inner iterations */
1244         inneriter                  += j_index_end - j_index_start;
1245
1246         /* Outer loop uses 20 flops */
1247     }
1248
1249     /* Increment number of outer iterations */
1250     outeriter        += nri;
1251
1252     /* Update outer/inner flops */
1253
1254     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*436);
1255 }
1256 /*
1257  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_single
1258  * Electrostatics interaction: CubicSplineTable
1259  * VdW interaction:            LennardJones
1260  * Geometry:                   Water3-Water3
1261  * Calculate force/pot:        Force
1262  */
1263 void
1264 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_128_fma_single
1265                     (t_nblist                    * gmx_restrict       nlist,
1266                      rvec                        * gmx_restrict          xx,
1267                      rvec                        * gmx_restrict          ff,
1268                      t_forcerec                  * gmx_restrict          fr,
1269                      t_mdatoms                   * gmx_restrict     mdatoms,
1270                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1271                      t_nrnb                      * gmx_restrict        nrnb)
1272 {
1273     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1274      * just 0 for non-waters.
1275      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1276      * jnr indices corresponding to data put in the four positions in the SIMD register.
1277      */
1278     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1279     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1280     int              jnrA,jnrB,jnrC,jnrD;
1281     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1282     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1283     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1284     real             rcutoff_scalar;
1285     real             *shiftvec,*fshift,*x,*f;
1286     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1287     real             scratch[4*DIM];
1288     __m128           fscal,rcutoff,rcutoff2,jidxall;
1289     int              vdwioffset0;
1290     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1291     int              vdwioffset1;
1292     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1293     int              vdwioffset2;
1294     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1295     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1296     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1297     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1298     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1299     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1300     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1301     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1302     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1303     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1304     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1305     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1306     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1307     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1308     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1309     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1310     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1311     real             *charge;
1312     int              nvdwtype;
1313     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1314     int              *vdwtype;
1315     real             *vdwparam;
1316     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1317     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1318     __m128i          vfitab;
1319     __m128i          ifour       = _mm_set1_epi32(4);
1320     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1321     real             *vftab;
1322     __m128           dummy_mask,cutoff_mask;
1323     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1324     __m128           one     = _mm_set1_ps(1.0);
1325     __m128           two     = _mm_set1_ps(2.0);
1326     x                = xx[0];
1327     f                = ff[0];
1328
1329     nri              = nlist->nri;
1330     iinr             = nlist->iinr;
1331     jindex           = nlist->jindex;
1332     jjnr             = nlist->jjnr;
1333     shiftidx         = nlist->shift;
1334     gid              = nlist->gid;
1335     shiftvec         = fr->shift_vec[0];
1336     fshift           = fr->fshift[0];
1337     facel            = _mm_set1_ps(fr->epsfac);
1338     charge           = mdatoms->chargeA;
1339     nvdwtype         = fr->ntype;
1340     vdwparam         = fr->nbfp;
1341     vdwtype          = mdatoms->typeA;
1342
1343     vftab            = kernel_data->table_elec->data;
1344     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1345
1346     /* Setup water-specific parameters */
1347     inr              = nlist->iinr[0];
1348     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1349     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1350     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1351     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1352
1353     jq0              = _mm_set1_ps(charge[inr+0]);
1354     jq1              = _mm_set1_ps(charge[inr+1]);
1355     jq2              = _mm_set1_ps(charge[inr+2]);
1356     vdwjidx0A        = 2*vdwtype[inr+0];
1357     qq00             = _mm_mul_ps(iq0,jq0);
1358     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1359     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1360     qq01             = _mm_mul_ps(iq0,jq1);
1361     qq02             = _mm_mul_ps(iq0,jq2);
1362     qq10             = _mm_mul_ps(iq1,jq0);
1363     qq11             = _mm_mul_ps(iq1,jq1);
1364     qq12             = _mm_mul_ps(iq1,jq2);
1365     qq20             = _mm_mul_ps(iq2,jq0);
1366     qq21             = _mm_mul_ps(iq2,jq1);
1367     qq22             = _mm_mul_ps(iq2,jq2);
1368
1369     /* Avoid stupid compiler warnings */
1370     jnrA = jnrB = jnrC = jnrD = 0;
1371     j_coord_offsetA = 0;
1372     j_coord_offsetB = 0;
1373     j_coord_offsetC = 0;
1374     j_coord_offsetD = 0;
1375
1376     outeriter        = 0;
1377     inneriter        = 0;
1378
1379     for(iidx=0;iidx<4*DIM;iidx++)
1380     {
1381         scratch[iidx] = 0.0;
1382     }
1383
1384     /* Start outer loop over neighborlists */
1385     for(iidx=0; iidx<nri; iidx++)
1386     {
1387         /* Load shift vector for this list */
1388         i_shift_offset   = DIM*shiftidx[iidx];
1389
1390         /* Load limits for loop over neighbors */
1391         j_index_start    = jindex[iidx];
1392         j_index_end      = jindex[iidx+1];
1393
1394         /* Get outer coordinate index */
1395         inr              = iinr[iidx];
1396         i_coord_offset   = DIM*inr;
1397
1398         /* Load i particle coords and add shift vector */
1399         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1400                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1401
1402         fix0             = _mm_setzero_ps();
1403         fiy0             = _mm_setzero_ps();
1404         fiz0             = _mm_setzero_ps();
1405         fix1             = _mm_setzero_ps();
1406         fiy1             = _mm_setzero_ps();
1407         fiz1             = _mm_setzero_ps();
1408         fix2             = _mm_setzero_ps();
1409         fiy2             = _mm_setzero_ps();
1410         fiz2             = _mm_setzero_ps();
1411
1412         /* Start inner kernel loop */
1413         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1414         {
1415
1416             /* Get j neighbor index, and coordinate index */
1417             jnrA             = jjnr[jidx];
1418             jnrB             = jjnr[jidx+1];
1419             jnrC             = jjnr[jidx+2];
1420             jnrD             = jjnr[jidx+3];
1421             j_coord_offsetA  = DIM*jnrA;
1422             j_coord_offsetB  = DIM*jnrB;
1423             j_coord_offsetC  = DIM*jnrC;
1424             j_coord_offsetD  = DIM*jnrD;
1425
1426             /* load j atom coordinates */
1427             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1428                                               x+j_coord_offsetC,x+j_coord_offsetD,
1429                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1430
1431             /* Calculate displacement vector */
1432             dx00             = _mm_sub_ps(ix0,jx0);
1433             dy00             = _mm_sub_ps(iy0,jy0);
1434             dz00             = _mm_sub_ps(iz0,jz0);
1435             dx01             = _mm_sub_ps(ix0,jx1);
1436             dy01             = _mm_sub_ps(iy0,jy1);
1437             dz01             = _mm_sub_ps(iz0,jz1);
1438             dx02             = _mm_sub_ps(ix0,jx2);
1439             dy02             = _mm_sub_ps(iy0,jy2);
1440             dz02             = _mm_sub_ps(iz0,jz2);
1441             dx10             = _mm_sub_ps(ix1,jx0);
1442             dy10             = _mm_sub_ps(iy1,jy0);
1443             dz10             = _mm_sub_ps(iz1,jz0);
1444             dx11             = _mm_sub_ps(ix1,jx1);
1445             dy11             = _mm_sub_ps(iy1,jy1);
1446             dz11             = _mm_sub_ps(iz1,jz1);
1447             dx12             = _mm_sub_ps(ix1,jx2);
1448             dy12             = _mm_sub_ps(iy1,jy2);
1449             dz12             = _mm_sub_ps(iz1,jz2);
1450             dx20             = _mm_sub_ps(ix2,jx0);
1451             dy20             = _mm_sub_ps(iy2,jy0);
1452             dz20             = _mm_sub_ps(iz2,jz0);
1453             dx21             = _mm_sub_ps(ix2,jx1);
1454             dy21             = _mm_sub_ps(iy2,jy1);
1455             dz21             = _mm_sub_ps(iz2,jz1);
1456             dx22             = _mm_sub_ps(ix2,jx2);
1457             dy22             = _mm_sub_ps(iy2,jy2);
1458             dz22             = _mm_sub_ps(iz2,jz2);
1459
1460             /* Calculate squared distance and things based on it */
1461             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1462             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1463             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1464             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1465             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1466             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1467             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1468             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1469             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1470
1471             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1472             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1473             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1474             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1475             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1476             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1477             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1478             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1479             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1480
1481             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1482
1483             fjx0             = _mm_setzero_ps();
1484             fjy0             = _mm_setzero_ps();
1485             fjz0             = _mm_setzero_ps();
1486             fjx1             = _mm_setzero_ps();
1487             fjy1             = _mm_setzero_ps();
1488             fjz1             = _mm_setzero_ps();
1489             fjx2             = _mm_setzero_ps();
1490             fjy2             = _mm_setzero_ps();
1491             fjz2             = _mm_setzero_ps();
1492
1493             /**************************
1494              * CALCULATE INTERACTIONS *
1495              **************************/
1496
1497             r00              = _mm_mul_ps(rsq00,rinv00);
1498
1499             /* Calculate table index by multiplying r with table scale and truncate to integer */
1500             rt               = _mm_mul_ps(r00,vftabscale);
1501             vfitab           = _mm_cvttps_epi32(rt);
1502 #ifdef __XOP__
1503             vfeps            = _mm_frcz_ps(rt);
1504 #else
1505             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1506 #endif
1507             twovfeps         = _mm_add_ps(vfeps,vfeps);
1508             vfitab           = _mm_slli_epi32(vfitab,2);
1509
1510             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1511             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1512             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1513             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1514             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1515             _MM_TRANSPOSE4_PS(Y,F,G,H);
1516             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1517             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1518             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1519
1520             /* LENNARD-JONES DISPERSION/REPULSION */
1521
1522             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1523             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1524
1525             fscal            = _mm_add_ps(felec,fvdw);
1526
1527              /* Update vectorial force */
1528             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1529             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1530             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1531
1532             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1533             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1534             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1535
1536             /**************************
1537              * CALCULATE INTERACTIONS *
1538              **************************/
1539
1540             r01              = _mm_mul_ps(rsq01,rinv01);
1541
1542             /* Calculate table index by multiplying r with table scale and truncate to integer */
1543             rt               = _mm_mul_ps(r01,vftabscale);
1544             vfitab           = _mm_cvttps_epi32(rt);
1545 #ifdef __XOP__
1546             vfeps            = _mm_frcz_ps(rt);
1547 #else
1548             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1549 #endif
1550             twovfeps         = _mm_add_ps(vfeps,vfeps);
1551             vfitab           = _mm_slli_epi32(vfitab,2);
1552
1553             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1554             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1555             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1556             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1557             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1558             _MM_TRANSPOSE4_PS(Y,F,G,H);
1559             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1560             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1561             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1562
1563             fscal            = felec;
1564
1565              /* Update vectorial force */
1566             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1567             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1568             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1569
1570             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1571             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1572             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1573
1574             /**************************
1575              * CALCULATE INTERACTIONS *
1576              **************************/
1577
1578             r02              = _mm_mul_ps(rsq02,rinv02);
1579
1580             /* Calculate table index by multiplying r with table scale and truncate to integer */
1581             rt               = _mm_mul_ps(r02,vftabscale);
1582             vfitab           = _mm_cvttps_epi32(rt);
1583 #ifdef __XOP__
1584             vfeps            = _mm_frcz_ps(rt);
1585 #else
1586             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1587 #endif
1588             twovfeps         = _mm_add_ps(vfeps,vfeps);
1589             vfitab           = _mm_slli_epi32(vfitab,2);
1590
1591             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1592             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1593             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1594             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1595             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1596             _MM_TRANSPOSE4_PS(Y,F,G,H);
1597             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1598             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1599             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1600
1601             fscal            = felec;
1602
1603              /* Update vectorial force */
1604             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1605             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1606             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1607
1608             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1609             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1610             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1611
1612             /**************************
1613              * CALCULATE INTERACTIONS *
1614              **************************/
1615
1616             r10              = _mm_mul_ps(rsq10,rinv10);
1617
1618             /* Calculate table index by multiplying r with table scale and truncate to integer */
1619             rt               = _mm_mul_ps(r10,vftabscale);
1620             vfitab           = _mm_cvttps_epi32(rt);
1621 #ifdef __XOP__
1622             vfeps            = _mm_frcz_ps(rt);
1623 #else
1624             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1625 #endif
1626             twovfeps         = _mm_add_ps(vfeps,vfeps);
1627             vfitab           = _mm_slli_epi32(vfitab,2);
1628
1629             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1630             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1631             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1632             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1633             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1634             _MM_TRANSPOSE4_PS(Y,F,G,H);
1635             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1636             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1637             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1638
1639             fscal            = felec;
1640
1641              /* Update vectorial force */
1642             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1643             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1644             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1645
1646             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1647             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1648             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1649
1650             /**************************
1651              * CALCULATE INTERACTIONS *
1652              **************************/
1653
1654             r11              = _mm_mul_ps(rsq11,rinv11);
1655
1656             /* Calculate table index by multiplying r with table scale and truncate to integer */
1657             rt               = _mm_mul_ps(r11,vftabscale);
1658             vfitab           = _mm_cvttps_epi32(rt);
1659 #ifdef __XOP__
1660             vfeps            = _mm_frcz_ps(rt);
1661 #else
1662             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1663 #endif
1664             twovfeps         = _mm_add_ps(vfeps,vfeps);
1665             vfitab           = _mm_slli_epi32(vfitab,2);
1666
1667             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1668             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1669             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1670             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1671             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1672             _MM_TRANSPOSE4_PS(Y,F,G,H);
1673             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1674             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1675             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1676
1677             fscal            = felec;
1678
1679              /* Update vectorial force */
1680             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1681             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1682             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1683
1684             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1685             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1686             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1687
1688             /**************************
1689              * CALCULATE INTERACTIONS *
1690              **************************/
1691
1692             r12              = _mm_mul_ps(rsq12,rinv12);
1693
1694             /* Calculate table index by multiplying r with table scale and truncate to integer */
1695             rt               = _mm_mul_ps(r12,vftabscale);
1696             vfitab           = _mm_cvttps_epi32(rt);
1697 #ifdef __XOP__
1698             vfeps            = _mm_frcz_ps(rt);
1699 #else
1700             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1701 #endif
1702             twovfeps         = _mm_add_ps(vfeps,vfeps);
1703             vfitab           = _mm_slli_epi32(vfitab,2);
1704
1705             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1706             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1707             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1708             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1709             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1710             _MM_TRANSPOSE4_PS(Y,F,G,H);
1711             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1712             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1713             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1714
1715             fscal            = felec;
1716
1717              /* Update vectorial force */
1718             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1719             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1720             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1721
1722             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1723             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1724             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1725
1726             /**************************
1727              * CALCULATE INTERACTIONS *
1728              **************************/
1729
1730             r20              = _mm_mul_ps(rsq20,rinv20);
1731
1732             /* Calculate table index by multiplying r with table scale and truncate to integer */
1733             rt               = _mm_mul_ps(r20,vftabscale);
1734             vfitab           = _mm_cvttps_epi32(rt);
1735 #ifdef __XOP__
1736             vfeps            = _mm_frcz_ps(rt);
1737 #else
1738             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1739 #endif
1740             twovfeps         = _mm_add_ps(vfeps,vfeps);
1741             vfitab           = _mm_slli_epi32(vfitab,2);
1742
1743             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1744             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1745             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1746             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1747             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1748             _MM_TRANSPOSE4_PS(Y,F,G,H);
1749             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1750             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1751             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1752
1753             fscal            = felec;
1754
1755              /* Update vectorial force */
1756             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1757             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1758             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1759
1760             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1761             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1762             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1763
1764             /**************************
1765              * CALCULATE INTERACTIONS *
1766              **************************/
1767
1768             r21              = _mm_mul_ps(rsq21,rinv21);
1769
1770             /* Calculate table index by multiplying r with table scale and truncate to integer */
1771             rt               = _mm_mul_ps(r21,vftabscale);
1772             vfitab           = _mm_cvttps_epi32(rt);
1773 #ifdef __XOP__
1774             vfeps            = _mm_frcz_ps(rt);
1775 #else
1776             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1777 #endif
1778             twovfeps         = _mm_add_ps(vfeps,vfeps);
1779             vfitab           = _mm_slli_epi32(vfitab,2);
1780
1781             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1782             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1783             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1784             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1785             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1786             _MM_TRANSPOSE4_PS(Y,F,G,H);
1787             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1788             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1789             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1790
1791             fscal            = felec;
1792
1793              /* Update vectorial force */
1794             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1795             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1796             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1797
1798             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1799             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1800             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1801
1802             /**************************
1803              * CALCULATE INTERACTIONS *
1804              **************************/
1805
1806             r22              = _mm_mul_ps(rsq22,rinv22);
1807
1808             /* Calculate table index by multiplying r with table scale and truncate to integer */
1809             rt               = _mm_mul_ps(r22,vftabscale);
1810             vfitab           = _mm_cvttps_epi32(rt);
1811 #ifdef __XOP__
1812             vfeps            = _mm_frcz_ps(rt);
1813 #else
1814             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1815 #endif
1816             twovfeps         = _mm_add_ps(vfeps,vfeps);
1817             vfitab           = _mm_slli_epi32(vfitab,2);
1818
1819             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1820             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1821             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1822             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1823             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1824             _MM_TRANSPOSE4_PS(Y,F,G,H);
1825             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1826             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1827             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1828
1829             fscal            = felec;
1830
1831              /* Update vectorial force */
1832             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1833             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1834             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1835
1836             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1837             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1838             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1839
1840             fjptrA             = f+j_coord_offsetA;
1841             fjptrB             = f+j_coord_offsetB;
1842             fjptrC             = f+j_coord_offsetC;
1843             fjptrD             = f+j_coord_offsetD;
1844
1845             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1846                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1847
1848             /* Inner loop uses 386 flops */
1849         }
1850
1851         if(jidx<j_index_end)
1852         {
1853
1854             /* Get j neighbor index, and coordinate index */
1855             jnrlistA         = jjnr[jidx];
1856             jnrlistB         = jjnr[jidx+1];
1857             jnrlistC         = jjnr[jidx+2];
1858             jnrlistD         = jjnr[jidx+3];
1859             /* Sign of each element will be negative for non-real atoms.
1860              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1861              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1862              */
1863             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1864             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1865             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1866             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1867             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1868             j_coord_offsetA  = DIM*jnrA;
1869             j_coord_offsetB  = DIM*jnrB;
1870             j_coord_offsetC  = DIM*jnrC;
1871             j_coord_offsetD  = DIM*jnrD;
1872
1873             /* load j atom coordinates */
1874             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1875                                               x+j_coord_offsetC,x+j_coord_offsetD,
1876                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1877
1878             /* Calculate displacement vector */
1879             dx00             = _mm_sub_ps(ix0,jx0);
1880             dy00             = _mm_sub_ps(iy0,jy0);
1881             dz00             = _mm_sub_ps(iz0,jz0);
1882             dx01             = _mm_sub_ps(ix0,jx1);
1883             dy01             = _mm_sub_ps(iy0,jy1);
1884             dz01             = _mm_sub_ps(iz0,jz1);
1885             dx02             = _mm_sub_ps(ix0,jx2);
1886             dy02             = _mm_sub_ps(iy0,jy2);
1887             dz02             = _mm_sub_ps(iz0,jz2);
1888             dx10             = _mm_sub_ps(ix1,jx0);
1889             dy10             = _mm_sub_ps(iy1,jy0);
1890             dz10             = _mm_sub_ps(iz1,jz0);
1891             dx11             = _mm_sub_ps(ix1,jx1);
1892             dy11             = _mm_sub_ps(iy1,jy1);
1893             dz11             = _mm_sub_ps(iz1,jz1);
1894             dx12             = _mm_sub_ps(ix1,jx2);
1895             dy12             = _mm_sub_ps(iy1,jy2);
1896             dz12             = _mm_sub_ps(iz1,jz2);
1897             dx20             = _mm_sub_ps(ix2,jx0);
1898             dy20             = _mm_sub_ps(iy2,jy0);
1899             dz20             = _mm_sub_ps(iz2,jz0);
1900             dx21             = _mm_sub_ps(ix2,jx1);
1901             dy21             = _mm_sub_ps(iy2,jy1);
1902             dz21             = _mm_sub_ps(iz2,jz1);
1903             dx22             = _mm_sub_ps(ix2,jx2);
1904             dy22             = _mm_sub_ps(iy2,jy2);
1905             dz22             = _mm_sub_ps(iz2,jz2);
1906
1907             /* Calculate squared distance and things based on it */
1908             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1909             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1910             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1911             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1912             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1913             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1914             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1915             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1916             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1917
1918             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1919             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1920             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1921             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1922             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1923             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1924             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1925             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1926             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1927
1928             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1929
1930             fjx0             = _mm_setzero_ps();
1931             fjy0             = _mm_setzero_ps();
1932             fjz0             = _mm_setzero_ps();
1933             fjx1             = _mm_setzero_ps();
1934             fjy1             = _mm_setzero_ps();
1935             fjz1             = _mm_setzero_ps();
1936             fjx2             = _mm_setzero_ps();
1937             fjy2             = _mm_setzero_ps();
1938             fjz2             = _mm_setzero_ps();
1939
1940             /**************************
1941              * CALCULATE INTERACTIONS *
1942              **************************/
1943
1944             r00              = _mm_mul_ps(rsq00,rinv00);
1945             r00              = _mm_andnot_ps(dummy_mask,r00);
1946
1947             /* Calculate table index by multiplying r with table scale and truncate to integer */
1948             rt               = _mm_mul_ps(r00,vftabscale);
1949             vfitab           = _mm_cvttps_epi32(rt);
1950 #ifdef __XOP__
1951             vfeps            = _mm_frcz_ps(rt);
1952 #else
1953             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1954 #endif
1955             twovfeps         = _mm_add_ps(vfeps,vfeps);
1956             vfitab           = _mm_slli_epi32(vfitab,2);
1957
1958             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1959             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1960             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1961             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1962             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1963             _MM_TRANSPOSE4_PS(Y,F,G,H);
1964             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1965             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1966             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1967
1968             /* LENNARD-JONES DISPERSION/REPULSION */
1969
1970             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1971             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1972
1973             fscal            = _mm_add_ps(felec,fvdw);
1974
1975             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1976
1977              /* Update vectorial force */
1978             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1979             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1980             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1981
1982             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1983             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1984             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1985
1986             /**************************
1987              * CALCULATE INTERACTIONS *
1988              **************************/
1989
1990             r01              = _mm_mul_ps(rsq01,rinv01);
1991             r01              = _mm_andnot_ps(dummy_mask,r01);
1992
1993             /* Calculate table index by multiplying r with table scale and truncate to integer */
1994             rt               = _mm_mul_ps(r01,vftabscale);
1995             vfitab           = _mm_cvttps_epi32(rt);
1996 #ifdef __XOP__
1997             vfeps            = _mm_frcz_ps(rt);
1998 #else
1999             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2000 #endif
2001             twovfeps         = _mm_add_ps(vfeps,vfeps);
2002             vfitab           = _mm_slli_epi32(vfitab,2);
2003
2004             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2005             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2006             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2007             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2008             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2009             _MM_TRANSPOSE4_PS(Y,F,G,H);
2010             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2011             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2012             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2013
2014             fscal            = felec;
2015
2016             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2017
2018              /* Update vectorial force */
2019             fix0             = _mm_macc_ps(dx01,fscal,fix0);
2020             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
2021             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
2022
2023             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
2024             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
2025             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
2026
2027             /**************************
2028              * CALCULATE INTERACTIONS *
2029              **************************/
2030
2031             r02              = _mm_mul_ps(rsq02,rinv02);
2032             r02              = _mm_andnot_ps(dummy_mask,r02);
2033
2034             /* Calculate table index by multiplying r with table scale and truncate to integer */
2035             rt               = _mm_mul_ps(r02,vftabscale);
2036             vfitab           = _mm_cvttps_epi32(rt);
2037 #ifdef __XOP__
2038             vfeps            = _mm_frcz_ps(rt);
2039 #else
2040             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2041 #endif
2042             twovfeps         = _mm_add_ps(vfeps,vfeps);
2043             vfitab           = _mm_slli_epi32(vfitab,2);
2044
2045             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2046             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2047             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2048             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2049             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2050             _MM_TRANSPOSE4_PS(Y,F,G,H);
2051             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2052             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2053             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2054
2055             fscal            = felec;
2056
2057             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2058
2059              /* Update vectorial force */
2060             fix0             = _mm_macc_ps(dx02,fscal,fix0);
2061             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
2062             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
2063
2064             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
2065             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
2066             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
2067
2068             /**************************
2069              * CALCULATE INTERACTIONS *
2070              **************************/
2071
2072             r10              = _mm_mul_ps(rsq10,rinv10);
2073             r10              = _mm_andnot_ps(dummy_mask,r10);
2074
2075             /* Calculate table index by multiplying r with table scale and truncate to integer */
2076             rt               = _mm_mul_ps(r10,vftabscale);
2077             vfitab           = _mm_cvttps_epi32(rt);
2078 #ifdef __XOP__
2079             vfeps            = _mm_frcz_ps(rt);
2080 #else
2081             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2082 #endif
2083             twovfeps         = _mm_add_ps(vfeps,vfeps);
2084             vfitab           = _mm_slli_epi32(vfitab,2);
2085
2086             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2087             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2088             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2089             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2090             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2091             _MM_TRANSPOSE4_PS(Y,F,G,H);
2092             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2093             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2094             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2095
2096             fscal            = felec;
2097
2098             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2099
2100              /* Update vectorial force */
2101             fix1             = _mm_macc_ps(dx10,fscal,fix1);
2102             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
2103             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
2104
2105             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
2106             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
2107             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
2108
2109             /**************************
2110              * CALCULATE INTERACTIONS *
2111              **************************/
2112
2113             r11              = _mm_mul_ps(rsq11,rinv11);
2114             r11              = _mm_andnot_ps(dummy_mask,r11);
2115
2116             /* Calculate table index by multiplying r with table scale and truncate to integer */
2117             rt               = _mm_mul_ps(r11,vftabscale);
2118             vfitab           = _mm_cvttps_epi32(rt);
2119 #ifdef __XOP__
2120             vfeps            = _mm_frcz_ps(rt);
2121 #else
2122             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2123 #endif
2124             twovfeps         = _mm_add_ps(vfeps,vfeps);
2125             vfitab           = _mm_slli_epi32(vfitab,2);
2126
2127             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2128             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2129             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2130             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2131             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2132             _MM_TRANSPOSE4_PS(Y,F,G,H);
2133             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2134             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2135             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2136
2137             fscal            = felec;
2138
2139             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2140
2141              /* Update vectorial force */
2142             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2143             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2144             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2145
2146             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2147             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2148             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2149
2150             /**************************
2151              * CALCULATE INTERACTIONS *
2152              **************************/
2153
2154             r12              = _mm_mul_ps(rsq12,rinv12);
2155             r12              = _mm_andnot_ps(dummy_mask,r12);
2156
2157             /* Calculate table index by multiplying r with table scale and truncate to integer */
2158             rt               = _mm_mul_ps(r12,vftabscale);
2159             vfitab           = _mm_cvttps_epi32(rt);
2160 #ifdef __XOP__
2161             vfeps            = _mm_frcz_ps(rt);
2162 #else
2163             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2164 #endif
2165             twovfeps         = _mm_add_ps(vfeps,vfeps);
2166             vfitab           = _mm_slli_epi32(vfitab,2);
2167
2168             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2169             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2170             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2171             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2172             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2173             _MM_TRANSPOSE4_PS(Y,F,G,H);
2174             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2175             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2176             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2177
2178             fscal            = felec;
2179
2180             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2181
2182              /* Update vectorial force */
2183             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2184             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2185             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2186
2187             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2188             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2189             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2190
2191             /**************************
2192              * CALCULATE INTERACTIONS *
2193              **************************/
2194
2195             r20              = _mm_mul_ps(rsq20,rinv20);
2196             r20              = _mm_andnot_ps(dummy_mask,r20);
2197
2198             /* Calculate table index by multiplying r with table scale and truncate to integer */
2199             rt               = _mm_mul_ps(r20,vftabscale);
2200             vfitab           = _mm_cvttps_epi32(rt);
2201 #ifdef __XOP__
2202             vfeps            = _mm_frcz_ps(rt);
2203 #else
2204             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2205 #endif
2206             twovfeps         = _mm_add_ps(vfeps,vfeps);
2207             vfitab           = _mm_slli_epi32(vfitab,2);
2208
2209             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2210             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2211             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2212             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2213             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2214             _MM_TRANSPOSE4_PS(Y,F,G,H);
2215             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2216             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2217             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2218
2219             fscal            = felec;
2220
2221             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2222
2223              /* Update vectorial force */
2224             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2225             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2226             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2227
2228             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2229             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2230             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2231
2232             /**************************
2233              * CALCULATE INTERACTIONS *
2234              **************************/
2235
2236             r21              = _mm_mul_ps(rsq21,rinv21);
2237             r21              = _mm_andnot_ps(dummy_mask,r21);
2238
2239             /* Calculate table index by multiplying r with table scale and truncate to integer */
2240             rt               = _mm_mul_ps(r21,vftabscale);
2241             vfitab           = _mm_cvttps_epi32(rt);
2242 #ifdef __XOP__
2243             vfeps            = _mm_frcz_ps(rt);
2244 #else
2245             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2246 #endif
2247             twovfeps         = _mm_add_ps(vfeps,vfeps);
2248             vfitab           = _mm_slli_epi32(vfitab,2);
2249
2250             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2251             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2252             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2253             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2254             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2255             _MM_TRANSPOSE4_PS(Y,F,G,H);
2256             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2257             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2258             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2259
2260             fscal            = felec;
2261
2262             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2263
2264              /* Update vectorial force */
2265             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2266             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2267             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2268
2269             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2270             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2271             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2272
2273             /**************************
2274              * CALCULATE INTERACTIONS *
2275              **************************/
2276
2277             r22              = _mm_mul_ps(rsq22,rinv22);
2278             r22              = _mm_andnot_ps(dummy_mask,r22);
2279
2280             /* Calculate table index by multiplying r with table scale and truncate to integer */
2281             rt               = _mm_mul_ps(r22,vftabscale);
2282             vfitab           = _mm_cvttps_epi32(rt);
2283 #ifdef __XOP__
2284             vfeps            = _mm_frcz_ps(rt);
2285 #else
2286             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2287 #endif
2288             twovfeps         = _mm_add_ps(vfeps,vfeps);
2289             vfitab           = _mm_slli_epi32(vfitab,2);
2290
2291             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2292             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2293             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2294             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2295             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2296             _MM_TRANSPOSE4_PS(Y,F,G,H);
2297             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2298             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2299             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2300
2301             fscal            = felec;
2302
2303             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2304
2305              /* Update vectorial force */
2306             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2307             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2308             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2309
2310             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2311             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2312             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2313
2314             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2315             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2316             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2317             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2318
2319             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2320                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2321
2322             /* Inner loop uses 395 flops */
2323         }
2324
2325         /* End of innermost loop */
2326
2327         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2328                                               f+i_coord_offset,fshift+i_shift_offset);
2329
2330         /* Increment number of inner iterations */
2331         inneriter                  += j_index_end - j_index_start;
2332
2333         /* Outer loop uses 18 flops */
2334     }
2335
2336     /* Increment number of outer iterations */
2337     outeriter        += nri;
2338
2339     /* Update outer/inner flops */
2340
2341     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*395);
2342 }