Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_avx_128_fma_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_128_fma_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_single
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_128_fma_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
115     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
116     __m128i          vfitab;
117     __m128i          ifour       = _mm_set1_epi32(4);
118     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
119     real             *vftab;
120     __m128           dummy_mask,cutoff_mask;
121     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
122     __m128           one     = _mm_set1_ps(1.0);
123     __m128           two     = _mm_set1_ps(2.0);
124     x                = xx[0];
125     f                = ff[0];
126
127     nri              = nlist->nri;
128     iinr             = nlist->iinr;
129     jindex           = nlist->jindex;
130     jjnr             = nlist->jjnr;
131     shiftidx         = nlist->shift;
132     gid              = nlist->gid;
133     shiftvec         = fr->shift_vec[0];
134     fshift           = fr->fshift[0];
135     facel            = _mm_set1_ps(fr->ic->epsfac);
136     charge           = mdatoms->chargeA;
137     nvdwtype         = fr->ntype;
138     vdwparam         = fr->nbfp;
139     vdwtype          = mdatoms->typeA;
140
141     vftab            = kernel_data->table_elec->data;
142     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
143
144     /* Setup water-specific parameters */
145     inr              = nlist->iinr[0];
146     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
147     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
148     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
149     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
150
151     jq1              = _mm_set1_ps(charge[inr+1]);
152     jq2              = _mm_set1_ps(charge[inr+2]);
153     jq3              = _mm_set1_ps(charge[inr+3]);
154     vdwjidx0A        = 2*vdwtype[inr+0];
155     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
156     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
157     qq11             = _mm_mul_ps(iq1,jq1);
158     qq12             = _mm_mul_ps(iq1,jq2);
159     qq13             = _mm_mul_ps(iq1,jq3);
160     qq21             = _mm_mul_ps(iq2,jq1);
161     qq22             = _mm_mul_ps(iq2,jq2);
162     qq23             = _mm_mul_ps(iq2,jq3);
163     qq31             = _mm_mul_ps(iq3,jq1);
164     qq32             = _mm_mul_ps(iq3,jq2);
165     qq33             = _mm_mul_ps(iq3,jq3);
166
167     /* Avoid stupid compiler warnings */
168     jnrA = jnrB = jnrC = jnrD = 0;
169     j_coord_offsetA = 0;
170     j_coord_offsetB = 0;
171     j_coord_offsetC = 0;
172     j_coord_offsetD = 0;
173
174     outeriter        = 0;
175     inneriter        = 0;
176
177     for(iidx=0;iidx<4*DIM;iidx++)
178     {
179         scratch[iidx] = 0.0;
180     }
181
182     /* Start outer loop over neighborlists */
183     for(iidx=0; iidx<nri; iidx++)
184     {
185         /* Load shift vector for this list */
186         i_shift_offset   = DIM*shiftidx[iidx];
187
188         /* Load limits for loop over neighbors */
189         j_index_start    = jindex[iidx];
190         j_index_end      = jindex[iidx+1];
191
192         /* Get outer coordinate index */
193         inr              = iinr[iidx];
194         i_coord_offset   = DIM*inr;
195
196         /* Load i particle coords and add shift vector */
197         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
198                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
199
200         fix0             = _mm_setzero_ps();
201         fiy0             = _mm_setzero_ps();
202         fiz0             = _mm_setzero_ps();
203         fix1             = _mm_setzero_ps();
204         fiy1             = _mm_setzero_ps();
205         fiz1             = _mm_setzero_ps();
206         fix2             = _mm_setzero_ps();
207         fiy2             = _mm_setzero_ps();
208         fiz2             = _mm_setzero_ps();
209         fix3             = _mm_setzero_ps();
210         fiy3             = _mm_setzero_ps();
211         fiz3             = _mm_setzero_ps();
212
213         /* Reset potential sums */
214         velecsum         = _mm_setzero_ps();
215         vvdwsum          = _mm_setzero_ps();
216
217         /* Start inner kernel loop */
218         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
219         {
220
221             /* Get j neighbor index, and coordinate index */
222             jnrA             = jjnr[jidx];
223             jnrB             = jjnr[jidx+1];
224             jnrC             = jjnr[jidx+2];
225             jnrD             = jjnr[jidx+3];
226             j_coord_offsetA  = DIM*jnrA;
227             j_coord_offsetB  = DIM*jnrB;
228             j_coord_offsetC  = DIM*jnrC;
229             j_coord_offsetD  = DIM*jnrD;
230
231             /* load j atom coordinates */
232             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
233                                               x+j_coord_offsetC,x+j_coord_offsetD,
234                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
235                                               &jy2,&jz2,&jx3,&jy3,&jz3);
236
237             /* Calculate displacement vector */
238             dx00             = _mm_sub_ps(ix0,jx0);
239             dy00             = _mm_sub_ps(iy0,jy0);
240             dz00             = _mm_sub_ps(iz0,jz0);
241             dx11             = _mm_sub_ps(ix1,jx1);
242             dy11             = _mm_sub_ps(iy1,jy1);
243             dz11             = _mm_sub_ps(iz1,jz1);
244             dx12             = _mm_sub_ps(ix1,jx2);
245             dy12             = _mm_sub_ps(iy1,jy2);
246             dz12             = _mm_sub_ps(iz1,jz2);
247             dx13             = _mm_sub_ps(ix1,jx3);
248             dy13             = _mm_sub_ps(iy1,jy3);
249             dz13             = _mm_sub_ps(iz1,jz3);
250             dx21             = _mm_sub_ps(ix2,jx1);
251             dy21             = _mm_sub_ps(iy2,jy1);
252             dz21             = _mm_sub_ps(iz2,jz1);
253             dx22             = _mm_sub_ps(ix2,jx2);
254             dy22             = _mm_sub_ps(iy2,jy2);
255             dz22             = _mm_sub_ps(iz2,jz2);
256             dx23             = _mm_sub_ps(ix2,jx3);
257             dy23             = _mm_sub_ps(iy2,jy3);
258             dz23             = _mm_sub_ps(iz2,jz3);
259             dx31             = _mm_sub_ps(ix3,jx1);
260             dy31             = _mm_sub_ps(iy3,jy1);
261             dz31             = _mm_sub_ps(iz3,jz1);
262             dx32             = _mm_sub_ps(ix3,jx2);
263             dy32             = _mm_sub_ps(iy3,jy2);
264             dz32             = _mm_sub_ps(iz3,jz2);
265             dx33             = _mm_sub_ps(ix3,jx3);
266             dy33             = _mm_sub_ps(iy3,jy3);
267             dz33             = _mm_sub_ps(iz3,jz3);
268
269             /* Calculate squared distance and things based on it */
270             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
271             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
272             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
273             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
274             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
275             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
276             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
277             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
278             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
279             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
280
281             rinv11           = avx128fma_invsqrt_f(rsq11);
282             rinv12           = avx128fma_invsqrt_f(rsq12);
283             rinv13           = avx128fma_invsqrt_f(rsq13);
284             rinv21           = avx128fma_invsqrt_f(rsq21);
285             rinv22           = avx128fma_invsqrt_f(rsq22);
286             rinv23           = avx128fma_invsqrt_f(rsq23);
287             rinv31           = avx128fma_invsqrt_f(rsq31);
288             rinv32           = avx128fma_invsqrt_f(rsq32);
289             rinv33           = avx128fma_invsqrt_f(rsq33);
290
291             rinvsq00         = avx128fma_inv_f(rsq00);
292
293             fjx0             = _mm_setzero_ps();
294             fjy0             = _mm_setzero_ps();
295             fjz0             = _mm_setzero_ps();
296             fjx1             = _mm_setzero_ps();
297             fjy1             = _mm_setzero_ps();
298             fjz1             = _mm_setzero_ps();
299             fjx2             = _mm_setzero_ps();
300             fjy2             = _mm_setzero_ps();
301             fjz2             = _mm_setzero_ps();
302             fjx3             = _mm_setzero_ps();
303             fjy3             = _mm_setzero_ps();
304             fjz3             = _mm_setzero_ps();
305
306             /**************************
307              * CALCULATE INTERACTIONS *
308              **************************/
309
310             /* LENNARD-JONES DISPERSION/REPULSION */
311
312             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
313             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
314             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
315             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
316             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
317
318             /* Update potential sum for this i atom from the interaction with this j atom. */
319             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
320
321             fscal            = fvdw;
322
323              /* Update vectorial force */
324             fix0             = _mm_macc_ps(dx00,fscal,fix0);
325             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
326             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
327
328             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
329             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
330             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
331
332             /**************************
333              * CALCULATE INTERACTIONS *
334              **************************/
335
336             r11              = _mm_mul_ps(rsq11,rinv11);
337
338             /* Calculate table index by multiplying r with table scale and truncate to integer */
339             rt               = _mm_mul_ps(r11,vftabscale);
340             vfitab           = _mm_cvttps_epi32(rt);
341 #ifdef __XOP__
342             vfeps            = _mm_frcz_ps(rt);
343 #else
344             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
345 #endif
346             twovfeps         = _mm_add_ps(vfeps,vfeps);
347             vfitab           = _mm_slli_epi32(vfitab,2);
348
349             /* CUBIC SPLINE TABLE ELECTROSTATICS */
350             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
351             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
352             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
353             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
354             _MM_TRANSPOSE4_PS(Y,F,G,H);
355             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
356             VV               = _mm_macc_ps(vfeps,Fp,Y);
357             velec            = _mm_mul_ps(qq11,VV);
358             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
359             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
360
361             /* Update potential sum for this i atom from the interaction with this j atom. */
362             velecsum         = _mm_add_ps(velecsum,velec);
363
364             fscal            = felec;
365
366              /* Update vectorial force */
367             fix1             = _mm_macc_ps(dx11,fscal,fix1);
368             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
369             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
370
371             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
372             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
373             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
374
375             /**************************
376              * CALCULATE INTERACTIONS *
377              **************************/
378
379             r12              = _mm_mul_ps(rsq12,rinv12);
380
381             /* Calculate table index by multiplying r with table scale and truncate to integer */
382             rt               = _mm_mul_ps(r12,vftabscale);
383             vfitab           = _mm_cvttps_epi32(rt);
384 #ifdef __XOP__
385             vfeps            = _mm_frcz_ps(rt);
386 #else
387             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
388 #endif
389             twovfeps         = _mm_add_ps(vfeps,vfeps);
390             vfitab           = _mm_slli_epi32(vfitab,2);
391
392             /* CUBIC SPLINE TABLE ELECTROSTATICS */
393             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
394             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
395             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
396             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
397             _MM_TRANSPOSE4_PS(Y,F,G,H);
398             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
399             VV               = _mm_macc_ps(vfeps,Fp,Y);
400             velec            = _mm_mul_ps(qq12,VV);
401             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
402             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
403
404             /* Update potential sum for this i atom from the interaction with this j atom. */
405             velecsum         = _mm_add_ps(velecsum,velec);
406
407             fscal            = felec;
408
409              /* Update vectorial force */
410             fix1             = _mm_macc_ps(dx12,fscal,fix1);
411             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
412             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
413
414             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
415             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
416             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
417
418             /**************************
419              * CALCULATE INTERACTIONS *
420              **************************/
421
422             r13              = _mm_mul_ps(rsq13,rinv13);
423
424             /* Calculate table index by multiplying r with table scale and truncate to integer */
425             rt               = _mm_mul_ps(r13,vftabscale);
426             vfitab           = _mm_cvttps_epi32(rt);
427 #ifdef __XOP__
428             vfeps            = _mm_frcz_ps(rt);
429 #else
430             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
431 #endif
432             twovfeps         = _mm_add_ps(vfeps,vfeps);
433             vfitab           = _mm_slli_epi32(vfitab,2);
434
435             /* CUBIC SPLINE TABLE ELECTROSTATICS */
436             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
437             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
438             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
439             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
440             _MM_TRANSPOSE4_PS(Y,F,G,H);
441             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
442             VV               = _mm_macc_ps(vfeps,Fp,Y);
443             velec            = _mm_mul_ps(qq13,VV);
444             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
445             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
446
447             /* Update potential sum for this i atom from the interaction with this j atom. */
448             velecsum         = _mm_add_ps(velecsum,velec);
449
450             fscal            = felec;
451
452              /* Update vectorial force */
453             fix1             = _mm_macc_ps(dx13,fscal,fix1);
454             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
455             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
456
457             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
458             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
459             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
460
461             /**************************
462              * CALCULATE INTERACTIONS *
463              **************************/
464
465             r21              = _mm_mul_ps(rsq21,rinv21);
466
467             /* Calculate table index by multiplying r with table scale and truncate to integer */
468             rt               = _mm_mul_ps(r21,vftabscale);
469             vfitab           = _mm_cvttps_epi32(rt);
470 #ifdef __XOP__
471             vfeps            = _mm_frcz_ps(rt);
472 #else
473             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
474 #endif
475             twovfeps         = _mm_add_ps(vfeps,vfeps);
476             vfitab           = _mm_slli_epi32(vfitab,2);
477
478             /* CUBIC SPLINE TABLE ELECTROSTATICS */
479             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
480             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
481             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
482             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
483             _MM_TRANSPOSE4_PS(Y,F,G,H);
484             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
485             VV               = _mm_macc_ps(vfeps,Fp,Y);
486             velec            = _mm_mul_ps(qq21,VV);
487             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
488             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
489
490             /* Update potential sum for this i atom from the interaction with this j atom. */
491             velecsum         = _mm_add_ps(velecsum,velec);
492
493             fscal            = felec;
494
495              /* Update vectorial force */
496             fix2             = _mm_macc_ps(dx21,fscal,fix2);
497             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
498             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
499
500             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
501             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
502             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
503
504             /**************************
505              * CALCULATE INTERACTIONS *
506              **************************/
507
508             r22              = _mm_mul_ps(rsq22,rinv22);
509
510             /* Calculate table index by multiplying r with table scale and truncate to integer */
511             rt               = _mm_mul_ps(r22,vftabscale);
512             vfitab           = _mm_cvttps_epi32(rt);
513 #ifdef __XOP__
514             vfeps            = _mm_frcz_ps(rt);
515 #else
516             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
517 #endif
518             twovfeps         = _mm_add_ps(vfeps,vfeps);
519             vfitab           = _mm_slli_epi32(vfitab,2);
520
521             /* CUBIC SPLINE TABLE ELECTROSTATICS */
522             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
523             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
524             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
525             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
526             _MM_TRANSPOSE4_PS(Y,F,G,H);
527             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
528             VV               = _mm_macc_ps(vfeps,Fp,Y);
529             velec            = _mm_mul_ps(qq22,VV);
530             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
531             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
532
533             /* Update potential sum for this i atom from the interaction with this j atom. */
534             velecsum         = _mm_add_ps(velecsum,velec);
535
536             fscal            = felec;
537
538              /* Update vectorial force */
539             fix2             = _mm_macc_ps(dx22,fscal,fix2);
540             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
541             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
542
543             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
544             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
545             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
546
547             /**************************
548              * CALCULATE INTERACTIONS *
549              **************************/
550
551             r23              = _mm_mul_ps(rsq23,rinv23);
552
553             /* Calculate table index by multiplying r with table scale and truncate to integer */
554             rt               = _mm_mul_ps(r23,vftabscale);
555             vfitab           = _mm_cvttps_epi32(rt);
556 #ifdef __XOP__
557             vfeps            = _mm_frcz_ps(rt);
558 #else
559             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
560 #endif
561             twovfeps         = _mm_add_ps(vfeps,vfeps);
562             vfitab           = _mm_slli_epi32(vfitab,2);
563
564             /* CUBIC SPLINE TABLE ELECTROSTATICS */
565             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
566             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
567             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
568             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
569             _MM_TRANSPOSE4_PS(Y,F,G,H);
570             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
571             VV               = _mm_macc_ps(vfeps,Fp,Y);
572             velec            = _mm_mul_ps(qq23,VV);
573             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
574             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
575
576             /* Update potential sum for this i atom from the interaction with this j atom. */
577             velecsum         = _mm_add_ps(velecsum,velec);
578
579             fscal            = felec;
580
581              /* Update vectorial force */
582             fix2             = _mm_macc_ps(dx23,fscal,fix2);
583             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
584             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
585
586             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
587             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
588             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
589
590             /**************************
591              * CALCULATE INTERACTIONS *
592              **************************/
593
594             r31              = _mm_mul_ps(rsq31,rinv31);
595
596             /* Calculate table index by multiplying r with table scale and truncate to integer */
597             rt               = _mm_mul_ps(r31,vftabscale);
598             vfitab           = _mm_cvttps_epi32(rt);
599 #ifdef __XOP__
600             vfeps            = _mm_frcz_ps(rt);
601 #else
602             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
603 #endif
604             twovfeps         = _mm_add_ps(vfeps,vfeps);
605             vfitab           = _mm_slli_epi32(vfitab,2);
606
607             /* CUBIC SPLINE TABLE ELECTROSTATICS */
608             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
609             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
610             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
611             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
612             _MM_TRANSPOSE4_PS(Y,F,G,H);
613             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
614             VV               = _mm_macc_ps(vfeps,Fp,Y);
615             velec            = _mm_mul_ps(qq31,VV);
616             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
617             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
618
619             /* Update potential sum for this i atom from the interaction with this j atom. */
620             velecsum         = _mm_add_ps(velecsum,velec);
621
622             fscal            = felec;
623
624              /* Update vectorial force */
625             fix3             = _mm_macc_ps(dx31,fscal,fix3);
626             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
627             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
628
629             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
630             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
631             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
632
633             /**************************
634              * CALCULATE INTERACTIONS *
635              **************************/
636
637             r32              = _mm_mul_ps(rsq32,rinv32);
638
639             /* Calculate table index by multiplying r with table scale and truncate to integer */
640             rt               = _mm_mul_ps(r32,vftabscale);
641             vfitab           = _mm_cvttps_epi32(rt);
642 #ifdef __XOP__
643             vfeps            = _mm_frcz_ps(rt);
644 #else
645             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
646 #endif
647             twovfeps         = _mm_add_ps(vfeps,vfeps);
648             vfitab           = _mm_slli_epi32(vfitab,2);
649
650             /* CUBIC SPLINE TABLE ELECTROSTATICS */
651             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
652             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
653             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
654             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
655             _MM_TRANSPOSE4_PS(Y,F,G,H);
656             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
657             VV               = _mm_macc_ps(vfeps,Fp,Y);
658             velec            = _mm_mul_ps(qq32,VV);
659             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
660             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
661
662             /* Update potential sum for this i atom from the interaction with this j atom. */
663             velecsum         = _mm_add_ps(velecsum,velec);
664
665             fscal            = felec;
666
667              /* Update vectorial force */
668             fix3             = _mm_macc_ps(dx32,fscal,fix3);
669             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
670             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
671
672             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
673             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
674             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
675
676             /**************************
677              * CALCULATE INTERACTIONS *
678              **************************/
679
680             r33              = _mm_mul_ps(rsq33,rinv33);
681
682             /* Calculate table index by multiplying r with table scale and truncate to integer */
683             rt               = _mm_mul_ps(r33,vftabscale);
684             vfitab           = _mm_cvttps_epi32(rt);
685 #ifdef __XOP__
686             vfeps            = _mm_frcz_ps(rt);
687 #else
688             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
689 #endif
690             twovfeps         = _mm_add_ps(vfeps,vfeps);
691             vfitab           = _mm_slli_epi32(vfitab,2);
692
693             /* CUBIC SPLINE TABLE ELECTROSTATICS */
694             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
695             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
696             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
697             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
698             _MM_TRANSPOSE4_PS(Y,F,G,H);
699             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
700             VV               = _mm_macc_ps(vfeps,Fp,Y);
701             velec            = _mm_mul_ps(qq33,VV);
702             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
703             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
704
705             /* Update potential sum for this i atom from the interaction with this j atom. */
706             velecsum         = _mm_add_ps(velecsum,velec);
707
708             fscal            = felec;
709
710              /* Update vectorial force */
711             fix3             = _mm_macc_ps(dx33,fscal,fix3);
712             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
713             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
714
715             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
716             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
717             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
718
719             fjptrA             = f+j_coord_offsetA;
720             fjptrB             = f+j_coord_offsetB;
721             fjptrC             = f+j_coord_offsetC;
722             fjptrD             = f+j_coord_offsetD;
723
724             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
725                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
726                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
727
728             /* Inner loop uses 452 flops */
729         }
730
731         if(jidx<j_index_end)
732         {
733
734             /* Get j neighbor index, and coordinate index */
735             jnrlistA         = jjnr[jidx];
736             jnrlistB         = jjnr[jidx+1];
737             jnrlistC         = jjnr[jidx+2];
738             jnrlistD         = jjnr[jidx+3];
739             /* Sign of each element will be negative for non-real atoms.
740              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
741              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
742              */
743             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
744             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
745             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
746             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
747             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
748             j_coord_offsetA  = DIM*jnrA;
749             j_coord_offsetB  = DIM*jnrB;
750             j_coord_offsetC  = DIM*jnrC;
751             j_coord_offsetD  = DIM*jnrD;
752
753             /* load j atom coordinates */
754             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
755                                               x+j_coord_offsetC,x+j_coord_offsetD,
756                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
757                                               &jy2,&jz2,&jx3,&jy3,&jz3);
758
759             /* Calculate displacement vector */
760             dx00             = _mm_sub_ps(ix0,jx0);
761             dy00             = _mm_sub_ps(iy0,jy0);
762             dz00             = _mm_sub_ps(iz0,jz0);
763             dx11             = _mm_sub_ps(ix1,jx1);
764             dy11             = _mm_sub_ps(iy1,jy1);
765             dz11             = _mm_sub_ps(iz1,jz1);
766             dx12             = _mm_sub_ps(ix1,jx2);
767             dy12             = _mm_sub_ps(iy1,jy2);
768             dz12             = _mm_sub_ps(iz1,jz2);
769             dx13             = _mm_sub_ps(ix1,jx3);
770             dy13             = _mm_sub_ps(iy1,jy3);
771             dz13             = _mm_sub_ps(iz1,jz3);
772             dx21             = _mm_sub_ps(ix2,jx1);
773             dy21             = _mm_sub_ps(iy2,jy1);
774             dz21             = _mm_sub_ps(iz2,jz1);
775             dx22             = _mm_sub_ps(ix2,jx2);
776             dy22             = _mm_sub_ps(iy2,jy2);
777             dz22             = _mm_sub_ps(iz2,jz2);
778             dx23             = _mm_sub_ps(ix2,jx3);
779             dy23             = _mm_sub_ps(iy2,jy3);
780             dz23             = _mm_sub_ps(iz2,jz3);
781             dx31             = _mm_sub_ps(ix3,jx1);
782             dy31             = _mm_sub_ps(iy3,jy1);
783             dz31             = _mm_sub_ps(iz3,jz1);
784             dx32             = _mm_sub_ps(ix3,jx2);
785             dy32             = _mm_sub_ps(iy3,jy2);
786             dz32             = _mm_sub_ps(iz3,jz2);
787             dx33             = _mm_sub_ps(ix3,jx3);
788             dy33             = _mm_sub_ps(iy3,jy3);
789             dz33             = _mm_sub_ps(iz3,jz3);
790
791             /* Calculate squared distance and things based on it */
792             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
793             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
794             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
795             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
796             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
797             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
798             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
799             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
800             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
801             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
802
803             rinv11           = avx128fma_invsqrt_f(rsq11);
804             rinv12           = avx128fma_invsqrt_f(rsq12);
805             rinv13           = avx128fma_invsqrt_f(rsq13);
806             rinv21           = avx128fma_invsqrt_f(rsq21);
807             rinv22           = avx128fma_invsqrt_f(rsq22);
808             rinv23           = avx128fma_invsqrt_f(rsq23);
809             rinv31           = avx128fma_invsqrt_f(rsq31);
810             rinv32           = avx128fma_invsqrt_f(rsq32);
811             rinv33           = avx128fma_invsqrt_f(rsq33);
812
813             rinvsq00         = avx128fma_inv_f(rsq00);
814
815             fjx0             = _mm_setzero_ps();
816             fjy0             = _mm_setzero_ps();
817             fjz0             = _mm_setzero_ps();
818             fjx1             = _mm_setzero_ps();
819             fjy1             = _mm_setzero_ps();
820             fjz1             = _mm_setzero_ps();
821             fjx2             = _mm_setzero_ps();
822             fjy2             = _mm_setzero_ps();
823             fjz2             = _mm_setzero_ps();
824             fjx3             = _mm_setzero_ps();
825             fjy3             = _mm_setzero_ps();
826             fjz3             = _mm_setzero_ps();
827
828             /**************************
829              * CALCULATE INTERACTIONS *
830              **************************/
831
832             /* LENNARD-JONES DISPERSION/REPULSION */
833
834             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
835             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
836             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
837             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
838             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
839
840             /* Update potential sum for this i atom from the interaction with this j atom. */
841             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
842             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
843
844             fscal            = fvdw;
845
846             fscal            = _mm_andnot_ps(dummy_mask,fscal);
847
848              /* Update vectorial force */
849             fix0             = _mm_macc_ps(dx00,fscal,fix0);
850             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
851             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
852
853             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
854             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
855             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
856
857             /**************************
858              * CALCULATE INTERACTIONS *
859              **************************/
860
861             r11              = _mm_mul_ps(rsq11,rinv11);
862             r11              = _mm_andnot_ps(dummy_mask,r11);
863
864             /* Calculate table index by multiplying r with table scale and truncate to integer */
865             rt               = _mm_mul_ps(r11,vftabscale);
866             vfitab           = _mm_cvttps_epi32(rt);
867 #ifdef __XOP__
868             vfeps            = _mm_frcz_ps(rt);
869 #else
870             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
871 #endif
872             twovfeps         = _mm_add_ps(vfeps,vfeps);
873             vfitab           = _mm_slli_epi32(vfitab,2);
874
875             /* CUBIC SPLINE TABLE ELECTROSTATICS */
876             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
877             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
878             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
879             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
880             _MM_TRANSPOSE4_PS(Y,F,G,H);
881             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
882             VV               = _mm_macc_ps(vfeps,Fp,Y);
883             velec            = _mm_mul_ps(qq11,VV);
884             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
885             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
886
887             /* Update potential sum for this i atom from the interaction with this j atom. */
888             velec            = _mm_andnot_ps(dummy_mask,velec);
889             velecsum         = _mm_add_ps(velecsum,velec);
890
891             fscal            = felec;
892
893             fscal            = _mm_andnot_ps(dummy_mask,fscal);
894
895              /* Update vectorial force */
896             fix1             = _mm_macc_ps(dx11,fscal,fix1);
897             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
898             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
899
900             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
901             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
902             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
903
904             /**************************
905              * CALCULATE INTERACTIONS *
906              **************************/
907
908             r12              = _mm_mul_ps(rsq12,rinv12);
909             r12              = _mm_andnot_ps(dummy_mask,r12);
910
911             /* Calculate table index by multiplying r with table scale and truncate to integer */
912             rt               = _mm_mul_ps(r12,vftabscale);
913             vfitab           = _mm_cvttps_epi32(rt);
914 #ifdef __XOP__
915             vfeps            = _mm_frcz_ps(rt);
916 #else
917             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
918 #endif
919             twovfeps         = _mm_add_ps(vfeps,vfeps);
920             vfitab           = _mm_slli_epi32(vfitab,2);
921
922             /* CUBIC SPLINE TABLE ELECTROSTATICS */
923             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
924             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
925             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
926             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
927             _MM_TRANSPOSE4_PS(Y,F,G,H);
928             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
929             VV               = _mm_macc_ps(vfeps,Fp,Y);
930             velec            = _mm_mul_ps(qq12,VV);
931             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
932             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
933
934             /* Update potential sum for this i atom from the interaction with this j atom. */
935             velec            = _mm_andnot_ps(dummy_mask,velec);
936             velecsum         = _mm_add_ps(velecsum,velec);
937
938             fscal            = felec;
939
940             fscal            = _mm_andnot_ps(dummy_mask,fscal);
941
942              /* Update vectorial force */
943             fix1             = _mm_macc_ps(dx12,fscal,fix1);
944             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
945             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
946
947             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
948             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
949             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
950
951             /**************************
952              * CALCULATE INTERACTIONS *
953              **************************/
954
955             r13              = _mm_mul_ps(rsq13,rinv13);
956             r13              = _mm_andnot_ps(dummy_mask,r13);
957
958             /* Calculate table index by multiplying r with table scale and truncate to integer */
959             rt               = _mm_mul_ps(r13,vftabscale);
960             vfitab           = _mm_cvttps_epi32(rt);
961 #ifdef __XOP__
962             vfeps            = _mm_frcz_ps(rt);
963 #else
964             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
965 #endif
966             twovfeps         = _mm_add_ps(vfeps,vfeps);
967             vfitab           = _mm_slli_epi32(vfitab,2);
968
969             /* CUBIC SPLINE TABLE ELECTROSTATICS */
970             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
971             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
972             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
973             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
974             _MM_TRANSPOSE4_PS(Y,F,G,H);
975             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
976             VV               = _mm_macc_ps(vfeps,Fp,Y);
977             velec            = _mm_mul_ps(qq13,VV);
978             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
979             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
980
981             /* Update potential sum for this i atom from the interaction with this j atom. */
982             velec            = _mm_andnot_ps(dummy_mask,velec);
983             velecsum         = _mm_add_ps(velecsum,velec);
984
985             fscal            = felec;
986
987             fscal            = _mm_andnot_ps(dummy_mask,fscal);
988
989              /* Update vectorial force */
990             fix1             = _mm_macc_ps(dx13,fscal,fix1);
991             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
992             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
993
994             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
995             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
996             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
997
998             /**************************
999              * CALCULATE INTERACTIONS *
1000              **************************/
1001
1002             r21              = _mm_mul_ps(rsq21,rinv21);
1003             r21              = _mm_andnot_ps(dummy_mask,r21);
1004
1005             /* Calculate table index by multiplying r with table scale and truncate to integer */
1006             rt               = _mm_mul_ps(r21,vftabscale);
1007             vfitab           = _mm_cvttps_epi32(rt);
1008 #ifdef __XOP__
1009             vfeps            = _mm_frcz_ps(rt);
1010 #else
1011             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1012 #endif
1013             twovfeps         = _mm_add_ps(vfeps,vfeps);
1014             vfitab           = _mm_slli_epi32(vfitab,2);
1015
1016             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1017             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1018             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1019             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1020             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1021             _MM_TRANSPOSE4_PS(Y,F,G,H);
1022             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1023             VV               = _mm_macc_ps(vfeps,Fp,Y);
1024             velec            = _mm_mul_ps(qq21,VV);
1025             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1026             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1027
1028             /* Update potential sum for this i atom from the interaction with this j atom. */
1029             velec            = _mm_andnot_ps(dummy_mask,velec);
1030             velecsum         = _mm_add_ps(velecsum,velec);
1031
1032             fscal            = felec;
1033
1034             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1035
1036              /* Update vectorial force */
1037             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1038             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1039             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1040
1041             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1042             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1043             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1044
1045             /**************************
1046              * CALCULATE INTERACTIONS *
1047              **************************/
1048
1049             r22              = _mm_mul_ps(rsq22,rinv22);
1050             r22              = _mm_andnot_ps(dummy_mask,r22);
1051
1052             /* Calculate table index by multiplying r with table scale and truncate to integer */
1053             rt               = _mm_mul_ps(r22,vftabscale);
1054             vfitab           = _mm_cvttps_epi32(rt);
1055 #ifdef __XOP__
1056             vfeps            = _mm_frcz_ps(rt);
1057 #else
1058             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1059 #endif
1060             twovfeps         = _mm_add_ps(vfeps,vfeps);
1061             vfitab           = _mm_slli_epi32(vfitab,2);
1062
1063             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1064             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1065             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1066             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1067             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1068             _MM_TRANSPOSE4_PS(Y,F,G,H);
1069             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1070             VV               = _mm_macc_ps(vfeps,Fp,Y);
1071             velec            = _mm_mul_ps(qq22,VV);
1072             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1073             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1074
1075             /* Update potential sum for this i atom from the interaction with this j atom. */
1076             velec            = _mm_andnot_ps(dummy_mask,velec);
1077             velecsum         = _mm_add_ps(velecsum,velec);
1078
1079             fscal            = felec;
1080
1081             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1082
1083              /* Update vectorial force */
1084             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1085             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1086             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1087
1088             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1089             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1090             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1091
1092             /**************************
1093              * CALCULATE INTERACTIONS *
1094              **************************/
1095
1096             r23              = _mm_mul_ps(rsq23,rinv23);
1097             r23              = _mm_andnot_ps(dummy_mask,r23);
1098
1099             /* Calculate table index by multiplying r with table scale and truncate to integer */
1100             rt               = _mm_mul_ps(r23,vftabscale);
1101             vfitab           = _mm_cvttps_epi32(rt);
1102 #ifdef __XOP__
1103             vfeps            = _mm_frcz_ps(rt);
1104 #else
1105             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1106 #endif
1107             twovfeps         = _mm_add_ps(vfeps,vfeps);
1108             vfitab           = _mm_slli_epi32(vfitab,2);
1109
1110             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1111             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1112             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1113             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1114             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1115             _MM_TRANSPOSE4_PS(Y,F,G,H);
1116             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1117             VV               = _mm_macc_ps(vfeps,Fp,Y);
1118             velec            = _mm_mul_ps(qq23,VV);
1119             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1120             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1121
1122             /* Update potential sum for this i atom from the interaction with this j atom. */
1123             velec            = _mm_andnot_ps(dummy_mask,velec);
1124             velecsum         = _mm_add_ps(velecsum,velec);
1125
1126             fscal            = felec;
1127
1128             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1129
1130              /* Update vectorial force */
1131             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1132             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1133             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1134
1135             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1136             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1137             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1138
1139             /**************************
1140              * CALCULATE INTERACTIONS *
1141              **************************/
1142
1143             r31              = _mm_mul_ps(rsq31,rinv31);
1144             r31              = _mm_andnot_ps(dummy_mask,r31);
1145
1146             /* Calculate table index by multiplying r with table scale and truncate to integer */
1147             rt               = _mm_mul_ps(r31,vftabscale);
1148             vfitab           = _mm_cvttps_epi32(rt);
1149 #ifdef __XOP__
1150             vfeps            = _mm_frcz_ps(rt);
1151 #else
1152             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1153 #endif
1154             twovfeps         = _mm_add_ps(vfeps,vfeps);
1155             vfitab           = _mm_slli_epi32(vfitab,2);
1156
1157             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1158             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1159             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1160             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1161             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1162             _MM_TRANSPOSE4_PS(Y,F,G,H);
1163             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1164             VV               = _mm_macc_ps(vfeps,Fp,Y);
1165             velec            = _mm_mul_ps(qq31,VV);
1166             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1167             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1168
1169             /* Update potential sum for this i atom from the interaction with this j atom. */
1170             velec            = _mm_andnot_ps(dummy_mask,velec);
1171             velecsum         = _mm_add_ps(velecsum,velec);
1172
1173             fscal            = felec;
1174
1175             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1176
1177              /* Update vectorial force */
1178             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1179             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1180             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1181
1182             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1183             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1184             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1185
1186             /**************************
1187              * CALCULATE INTERACTIONS *
1188              **************************/
1189
1190             r32              = _mm_mul_ps(rsq32,rinv32);
1191             r32              = _mm_andnot_ps(dummy_mask,r32);
1192
1193             /* Calculate table index by multiplying r with table scale and truncate to integer */
1194             rt               = _mm_mul_ps(r32,vftabscale);
1195             vfitab           = _mm_cvttps_epi32(rt);
1196 #ifdef __XOP__
1197             vfeps            = _mm_frcz_ps(rt);
1198 #else
1199             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1200 #endif
1201             twovfeps         = _mm_add_ps(vfeps,vfeps);
1202             vfitab           = _mm_slli_epi32(vfitab,2);
1203
1204             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1205             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1206             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1207             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1208             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1209             _MM_TRANSPOSE4_PS(Y,F,G,H);
1210             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1211             VV               = _mm_macc_ps(vfeps,Fp,Y);
1212             velec            = _mm_mul_ps(qq32,VV);
1213             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1214             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1215
1216             /* Update potential sum for this i atom from the interaction with this j atom. */
1217             velec            = _mm_andnot_ps(dummy_mask,velec);
1218             velecsum         = _mm_add_ps(velecsum,velec);
1219
1220             fscal            = felec;
1221
1222             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1223
1224              /* Update vectorial force */
1225             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1226             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1227             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1228
1229             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1230             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1231             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1232
1233             /**************************
1234              * CALCULATE INTERACTIONS *
1235              **************************/
1236
1237             r33              = _mm_mul_ps(rsq33,rinv33);
1238             r33              = _mm_andnot_ps(dummy_mask,r33);
1239
1240             /* Calculate table index by multiplying r with table scale and truncate to integer */
1241             rt               = _mm_mul_ps(r33,vftabscale);
1242             vfitab           = _mm_cvttps_epi32(rt);
1243 #ifdef __XOP__
1244             vfeps            = _mm_frcz_ps(rt);
1245 #else
1246             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1247 #endif
1248             twovfeps         = _mm_add_ps(vfeps,vfeps);
1249             vfitab           = _mm_slli_epi32(vfitab,2);
1250
1251             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1252             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1253             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1254             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1255             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1256             _MM_TRANSPOSE4_PS(Y,F,G,H);
1257             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1258             VV               = _mm_macc_ps(vfeps,Fp,Y);
1259             velec            = _mm_mul_ps(qq33,VV);
1260             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1261             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1262
1263             /* Update potential sum for this i atom from the interaction with this j atom. */
1264             velec            = _mm_andnot_ps(dummy_mask,velec);
1265             velecsum         = _mm_add_ps(velecsum,velec);
1266
1267             fscal            = felec;
1268
1269             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1270
1271              /* Update vectorial force */
1272             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1273             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1274             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1275
1276             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1277             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1278             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1279
1280             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1281             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1282             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1283             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1284
1285             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1286                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1287                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1288
1289             /* Inner loop uses 461 flops */
1290         }
1291
1292         /* End of innermost loop */
1293
1294         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1295                                               f+i_coord_offset,fshift+i_shift_offset);
1296
1297         ggid                        = gid[iidx];
1298         /* Update potential energies */
1299         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1300         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1301
1302         /* Increment number of inner iterations */
1303         inneriter                  += j_index_end - j_index_start;
1304
1305         /* Outer loop uses 26 flops */
1306     }
1307
1308     /* Increment number of outer iterations */
1309     outeriter        += nri;
1310
1311     /* Update outer/inner flops */
1312
1313     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*461);
1314 }
1315 /*
1316  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_single
1317  * Electrostatics interaction: CubicSplineTable
1318  * VdW interaction:            LennardJones
1319  * Geometry:                   Water4-Water4
1320  * Calculate force/pot:        Force
1321  */
1322 void
1323 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_128_fma_single
1324                     (t_nblist                    * gmx_restrict       nlist,
1325                      rvec                        * gmx_restrict          xx,
1326                      rvec                        * gmx_restrict          ff,
1327                      struct t_forcerec           * gmx_restrict          fr,
1328                      t_mdatoms                   * gmx_restrict     mdatoms,
1329                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1330                      t_nrnb                      * gmx_restrict        nrnb)
1331 {
1332     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1333      * just 0 for non-waters.
1334      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1335      * jnr indices corresponding to data put in the four positions in the SIMD register.
1336      */
1337     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1338     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1339     int              jnrA,jnrB,jnrC,jnrD;
1340     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1341     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1342     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1343     real             rcutoff_scalar;
1344     real             *shiftvec,*fshift,*x,*f;
1345     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1346     real             scratch[4*DIM];
1347     __m128           fscal,rcutoff,rcutoff2,jidxall;
1348     int              vdwioffset0;
1349     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1350     int              vdwioffset1;
1351     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1352     int              vdwioffset2;
1353     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1354     int              vdwioffset3;
1355     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1356     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1357     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1358     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1359     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1360     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1361     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1362     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1363     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1364     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1365     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1366     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1367     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1368     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1369     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1370     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1371     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1372     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1373     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1374     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1375     real             *charge;
1376     int              nvdwtype;
1377     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1378     int              *vdwtype;
1379     real             *vdwparam;
1380     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1381     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1382     __m128i          vfitab;
1383     __m128i          ifour       = _mm_set1_epi32(4);
1384     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1385     real             *vftab;
1386     __m128           dummy_mask,cutoff_mask;
1387     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1388     __m128           one     = _mm_set1_ps(1.0);
1389     __m128           two     = _mm_set1_ps(2.0);
1390     x                = xx[0];
1391     f                = ff[0];
1392
1393     nri              = nlist->nri;
1394     iinr             = nlist->iinr;
1395     jindex           = nlist->jindex;
1396     jjnr             = nlist->jjnr;
1397     shiftidx         = nlist->shift;
1398     gid              = nlist->gid;
1399     shiftvec         = fr->shift_vec[0];
1400     fshift           = fr->fshift[0];
1401     facel            = _mm_set1_ps(fr->ic->epsfac);
1402     charge           = mdatoms->chargeA;
1403     nvdwtype         = fr->ntype;
1404     vdwparam         = fr->nbfp;
1405     vdwtype          = mdatoms->typeA;
1406
1407     vftab            = kernel_data->table_elec->data;
1408     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1409
1410     /* Setup water-specific parameters */
1411     inr              = nlist->iinr[0];
1412     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1413     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1414     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1415     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1416
1417     jq1              = _mm_set1_ps(charge[inr+1]);
1418     jq2              = _mm_set1_ps(charge[inr+2]);
1419     jq3              = _mm_set1_ps(charge[inr+3]);
1420     vdwjidx0A        = 2*vdwtype[inr+0];
1421     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1422     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1423     qq11             = _mm_mul_ps(iq1,jq1);
1424     qq12             = _mm_mul_ps(iq1,jq2);
1425     qq13             = _mm_mul_ps(iq1,jq3);
1426     qq21             = _mm_mul_ps(iq2,jq1);
1427     qq22             = _mm_mul_ps(iq2,jq2);
1428     qq23             = _mm_mul_ps(iq2,jq3);
1429     qq31             = _mm_mul_ps(iq3,jq1);
1430     qq32             = _mm_mul_ps(iq3,jq2);
1431     qq33             = _mm_mul_ps(iq3,jq3);
1432
1433     /* Avoid stupid compiler warnings */
1434     jnrA = jnrB = jnrC = jnrD = 0;
1435     j_coord_offsetA = 0;
1436     j_coord_offsetB = 0;
1437     j_coord_offsetC = 0;
1438     j_coord_offsetD = 0;
1439
1440     outeriter        = 0;
1441     inneriter        = 0;
1442
1443     for(iidx=0;iidx<4*DIM;iidx++)
1444     {
1445         scratch[iidx] = 0.0;
1446     }
1447
1448     /* Start outer loop over neighborlists */
1449     for(iidx=0; iidx<nri; iidx++)
1450     {
1451         /* Load shift vector for this list */
1452         i_shift_offset   = DIM*shiftidx[iidx];
1453
1454         /* Load limits for loop over neighbors */
1455         j_index_start    = jindex[iidx];
1456         j_index_end      = jindex[iidx+1];
1457
1458         /* Get outer coordinate index */
1459         inr              = iinr[iidx];
1460         i_coord_offset   = DIM*inr;
1461
1462         /* Load i particle coords and add shift vector */
1463         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1464                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1465
1466         fix0             = _mm_setzero_ps();
1467         fiy0             = _mm_setzero_ps();
1468         fiz0             = _mm_setzero_ps();
1469         fix1             = _mm_setzero_ps();
1470         fiy1             = _mm_setzero_ps();
1471         fiz1             = _mm_setzero_ps();
1472         fix2             = _mm_setzero_ps();
1473         fiy2             = _mm_setzero_ps();
1474         fiz2             = _mm_setzero_ps();
1475         fix3             = _mm_setzero_ps();
1476         fiy3             = _mm_setzero_ps();
1477         fiz3             = _mm_setzero_ps();
1478
1479         /* Start inner kernel loop */
1480         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1481         {
1482
1483             /* Get j neighbor index, and coordinate index */
1484             jnrA             = jjnr[jidx];
1485             jnrB             = jjnr[jidx+1];
1486             jnrC             = jjnr[jidx+2];
1487             jnrD             = jjnr[jidx+3];
1488             j_coord_offsetA  = DIM*jnrA;
1489             j_coord_offsetB  = DIM*jnrB;
1490             j_coord_offsetC  = DIM*jnrC;
1491             j_coord_offsetD  = DIM*jnrD;
1492
1493             /* load j atom coordinates */
1494             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1495                                               x+j_coord_offsetC,x+j_coord_offsetD,
1496                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1497                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1498
1499             /* Calculate displacement vector */
1500             dx00             = _mm_sub_ps(ix0,jx0);
1501             dy00             = _mm_sub_ps(iy0,jy0);
1502             dz00             = _mm_sub_ps(iz0,jz0);
1503             dx11             = _mm_sub_ps(ix1,jx1);
1504             dy11             = _mm_sub_ps(iy1,jy1);
1505             dz11             = _mm_sub_ps(iz1,jz1);
1506             dx12             = _mm_sub_ps(ix1,jx2);
1507             dy12             = _mm_sub_ps(iy1,jy2);
1508             dz12             = _mm_sub_ps(iz1,jz2);
1509             dx13             = _mm_sub_ps(ix1,jx3);
1510             dy13             = _mm_sub_ps(iy1,jy3);
1511             dz13             = _mm_sub_ps(iz1,jz3);
1512             dx21             = _mm_sub_ps(ix2,jx1);
1513             dy21             = _mm_sub_ps(iy2,jy1);
1514             dz21             = _mm_sub_ps(iz2,jz1);
1515             dx22             = _mm_sub_ps(ix2,jx2);
1516             dy22             = _mm_sub_ps(iy2,jy2);
1517             dz22             = _mm_sub_ps(iz2,jz2);
1518             dx23             = _mm_sub_ps(ix2,jx3);
1519             dy23             = _mm_sub_ps(iy2,jy3);
1520             dz23             = _mm_sub_ps(iz2,jz3);
1521             dx31             = _mm_sub_ps(ix3,jx1);
1522             dy31             = _mm_sub_ps(iy3,jy1);
1523             dz31             = _mm_sub_ps(iz3,jz1);
1524             dx32             = _mm_sub_ps(ix3,jx2);
1525             dy32             = _mm_sub_ps(iy3,jy2);
1526             dz32             = _mm_sub_ps(iz3,jz2);
1527             dx33             = _mm_sub_ps(ix3,jx3);
1528             dy33             = _mm_sub_ps(iy3,jy3);
1529             dz33             = _mm_sub_ps(iz3,jz3);
1530
1531             /* Calculate squared distance and things based on it */
1532             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1533             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1534             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1535             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1536             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1537             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1538             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1539             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1540             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1541             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1542
1543             rinv11           = avx128fma_invsqrt_f(rsq11);
1544             rinv12           = avx128fma_invsqrt_f(rsq12);
1545             rinv13           = avx128fma_invsqrt_f(rsq13);
1546             rinv21           = avx128fma_invsqrt_f(rsq21);
1547             rinv22           = avx128fma_invsqrt_f(rsq22);
1548             rinv23           = avx128fma_invsqrt_f(rsq23);
1549             rinv31           = avx128fma_invsqrt_f(rsq31);
1550             rinv32           = avx128fma_invsqrt_f(rsq32);
1551             rinv33           = avx128fma_invsqrt_f(rsq33);
1552
1553             rinvsq00         = avx128fma_inv_f(rsq00);
1554
1555             fjx0             = _mm_setzero_ps();
1556             fjy0             = _mm_setzero_ps();
1557             fjz0             = _mm_setzero_ps();
1558             fjx1             = _mm_setzero_ps();
1559             fjy1             = _mm_setzero_ps();
1560             fjz1             = _mm_setzero_ps();
1561             fjx2             = _mm_setzero_ps();
1562             fjy2             = _mm_setzero_ps();
1563             fjz2             = _mm_setzero_ps();
1564             fjx3             = _mm_setzero_ps();
1565             fjy3             = _mm_setzero_ps();
1566             fjz3             = _mm_setzero_ps();
1567
1568             /**************************
1569              * CALCULATE INTERACTIONS *
1570              **************************/
1571
1572             /* LENNARD-JONES DISPERSION/REPULSION */
1573
1574             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1575             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1576
1577             fscal            = fvdw;
1578
1579              /* Update vectorial force */
1580             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1581             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1582             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1583
1584             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1585             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1586             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1587
1588             /**************************
1589              * CALCULATE INTERACTIONS *
1590              **************************/
1591
1592             r11              = _mm_mul_ps(rsq11,rinv11);
1593
1594             /* Calculate table index by multiplying r with table scale and truncate to integer */
1595             rt               = _mm_mul_ps(r11,vftabscale);
1596             vfitab           = _mm_cvttps_epi32(rt);
1597 #ifdef __XOP__
1598             vfeps            = _mm_frcz_ps(rt);
1599 #else
1600             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1601 #endif
1602             twovfeps         = _mm_add_ps(vfeps,vfeps);
1603             vfitab           = _mm_slli_epi32(vfitab,2);
1604
1605             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1606             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1607             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1608             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1609             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1610             _MM_TRANSPOSE4_PS(Y,F,G,H);
1611             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1612             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1613             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1614
1615             fscal            = felec;
1616
1617              /* Update vectorial force */
1618             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1619             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1620             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1621
1622             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1623             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1624             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1625
1626             /**************************
1627              * CALCULATE INTERACTIONS *
1628              **************************/
1629
1630             r12              = _mm_mul_ps(rsq12,rinv12);
1631
1632             /* Calculate table index by multiplying r with table scale and truncate to integer */
1633             rt               = _mm_mul_ps(r12,vftabscale);
1634             vfitab           = _mm_cvttps_epi32(rt);
1635 #ifdef __XOP__
1636             vfeps            = _mm_frcz_ps(rt);
1637 #else
1638             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1639 #endif
1640             twovfeps         = _mm_add_ps(vfeps,vfeps);
1641             vfitab           = _mm_slli_epi32(vfitab,2);
1642
1643             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1644             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1645             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1646             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1647             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1648             _MM_TRANSPOSE4_PS(Y,F,G,H);
1649             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1650             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1651             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1652
1653             fscal            = felec;
1654
1655              /* Update vectorial force */
1656             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1657             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1658             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1659
1660             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1661             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1662             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1663
1664             /**************************
1665              * CALCULATE INTERACTIONS *
1666              **************************/
1667
1668             r13              = _mm_mul_ps(rsq13,rinv13);
1669
1670             /* Calculate table index by multiplying r with table scale and truncate to integer */
1671             rt               = _mm_mul_ps(r13,vftabscale);
1672             vfitab           = _mm_cvttps_epi32(rt);
1673 #ifdef __XOP__
1674             vfeps            = _mm_frcz_ps(rt);
1675 #else
1676             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1677 #endif
1678             twovfeps         = _mm_add_ps(vfeps,vfeps);
1679             vfitab           = _mm_slli_epi32(vfitab,2);
1680
1681             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1682             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1683             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1684             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1685             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1686             _MM_TRANSPOSE4_PS(Y,F,G,H);
1687             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1688             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1689             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1690
1691             fscal            = felec;
1692
1693              /* Update vectorial force */
1694             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1695             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1696             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1697
1698             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1699             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1700             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1701
1702             /**************************
1703              * CALCULATE INTERACTIONS *
1704              **************************/
1705
1706             r21              = _mm_mul_ps(rsq21,rinv21);
1707
1708             /* Calculate table index by multiplying r with table scale and truncate to integer */
1709             rt               = _mm_mul_ps(r21,vftabscale);
1710             vfitab           = _mm_cvttps_epi32(rt);
1711 #ifdef __XOP__
1712             vfeps            = _mm_frcz_ps(rt);
1713 #else
1714             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1715 #endif
1716             twovfeps         = _mm_add_ps(vfeps,vfeps);
1717             vfitab           = _mm_slli_epi32(vfitab,2);
1718
1719             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1720             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1721             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1722             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1723             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1724             _MM_TRANSPOSE4_PS(Y,F,G,H);
1725             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1726             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1727             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1728
1729             fscal            = felec;
1730
1731              /* Update vectorial force */
1732             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1733             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1734             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1735
1736             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1737             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1738             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1739
1740             /**************************
1741              * CALCULATE INTERACTIONS *
1742              **************************/
1743
1744             r22              = _mm_mul_ps(rsq22,rinv22);
1745
1746             /* Calculate table index by multiplying r with table scale and truncate to integer */
1747             rt               = _mm_mul_ps(r22,vftabscale);
1748             vfitab           = _mm_cvttps_epi32(rt);
1749 #ifdef __XOP__
1750             vfeps            = _mm_frcz_ps(rt);
1751 #else
1752             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1753 #endif
1754             twovfeps         = _mm_add_ps(vfeps,vfeps);
1755             vfitab           = _mm_slli_epi32(vfitab,2);
1756
1757             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1758             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1759             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1760             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1761             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1762             _MM_TRANSPOSE4_PS(Y,F,G,H);
1763             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1764             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1765             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1766
1767             fscal            = felec;
1768
1769              /* Update vectorial force */
1770             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1771             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1772             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1773
1774             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1775             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1776             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1777
1778             /**************************
1779              * CALCULATE INTERACTIONS *
1780              **************************/
1781
1782             r23              = _mm_mul_ps(rsq23,rinv23);
1783
1784             /* Calculate table index by multiplying r with table scale and truncate to integer */
1785             rt               = _mm_mul_ps(r23,vftabscale);
1786             vfitab           = _mm_cvttps_epi32(rt);
1787 #ifdef __XOP__
1788             vfeps            = _mm_frcz_ps(rt);
1789 #else
1790             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1791 #endif
1792             twovfeps         = _mm_add_ps(vfeps,vfeps);
1793             vfitab           = _mm_slli_epi32(vfitab,2);
1794
1795             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1796             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1797             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1798             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1799             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1800             _MM_TRANSPOSE4_PS(Y,F,G,H);
1801             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1802             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1803             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1804
1805             fscal            = felec;
1806
1807              /* Update vectorial force */
1808             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1809             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1810             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1811
1812             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1813             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1814             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1815
1816             /**************************
1817              * CALCULATE INTERACTIONS *
1818              **************************/
1819
1820             r31              = _mm_mul_ps(rsq31,rinv31);
1821
1822             /* Calculate table index by multiplying r with table scale and truncate to integer */
1823             rt               = _mm_mul_ps(r31,vftabscale);
1824             vfitab           = _mm_cvttps_epi32(rt);
1825 #ifdef __XOP__
1826             vfeps            = _mm_frcz_ps(rt);
1827 #else
1828             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1829 #endif
1830             twovfeps         = _mm_add_ps(vfeps,vfeps);
1831             vfitab           = _mm_slli_epi32(vfitab,2);
1832
1833             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1834             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1835             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1836             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1837             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1838             _MM_TRANSPOSE4_PS(Y,F,G,H);
1839             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1840             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1841             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1842
1843             fscal            = felec;
1844
1845              /* Update vectorial force */
1846             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1847             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1848             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1849
1850             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1851             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1852             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1853
1854             /**************************
1855              * CALCULATE INTERACTIONS *
1856              **************************/
1857
1858             r32              = _mm_mul_ps(rsq32,rinv32);
1859
1860             /* Calculate table index by multiplying r with table scale and truncate to integer */
1861             rt               = _mm_mul_ps(r32,vftabscale);
1862             vfitab           = _mm_cvttps_epi32(rt);
1863 #ifdef __XOP__
1864             vfeps            = _mm_frcz_ps(rt);
1865 #else
1866             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1867 #endif
1868             twovfeps         = _mm_add_ps(vfeps,vfeps);
1869             vfitab           = _mm_slli_epi32(vfitab,2);
1870
1871             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1872             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1873             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1874             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1875             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1876             _MM_TRANSPOSE4_PS(Y,F,G,H);
1877             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1878             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1879             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1880
1881             fscal            = felec;
1882
1883              /* Update vectorial force */
1884             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1885             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1886             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1887
1888             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1889             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1890             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1891
1892             /**************************
1893              * CALCULATE INTERACTIONS *
1894              **************************/
1895
1896             r33              = _mm_mul_ps(rsq33,rinv33);
1897
1898             /* Calculate table index by multiplying r with table scale and truncate to integer */
1899             rt               = _mm_mul_ps(r33,vftabscale);
1900             vfitab           = _mm_cvttps_epi32(rt);
1901 #ifdef __XOP__
1902             vfeps            = _mm_frcz_ps(rt);
1903 #else
1904             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1905 #endif
1906             twovfeps         = _mm_add_ps(vfeps,vfeps);
1907             vfitab           = _mm_slli_epi32(vfitab,2);
1908
1909             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1910             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1911             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1912             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1913             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1914             _MM_TRANSPOSE4_PS(Y,F,G,H);
1915             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1916             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1917             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1918
1919             fscal            = felec;
1920
1921              /* Update vectorial force */
1922             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1923             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1924             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1925
1926             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1927             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1928             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1929
1930             fjptrA             = f+j_coord_offsetA;
1931             fjptrB             = f+j_coord_offsetB;
1932             fjptrC             = f+j_coord_offsetC;
1933             fjptrD             = f+j_coord_offsetD;
1934
1935             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1936                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1937                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1938
1939             /* Inner loop uses 411 flops */
1940         }
1941
1942         if(jidx<j_index_end)
1943         {
1944
1945             /* Get j neighbor index, and coordinate index */
1946             jnrlistA         = jjnr[jidx];
1947             jnrlistB         = jjnr[jidx+1];
1948             jnrlistC         = jjnr[jidx+2];
1949             jnrlistD         = jjnr[jidx+3];
1950             /* Sign of each element will be negative for non-real atoms.
1951              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1952              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1953              */
1954             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1955             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1956             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1957             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1958             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1959             j_coord_offsetA  = DIM*jnrA;
1960             j_coord_offsetB  = DIM*jnrB;
1961             j_coord_offsetC  = DIM*jnrC;
1962             j_coord_offsetD  = DIM*jnrD;
1963
1964             /* load j atom coordinates */
1965             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1966                                               x+j_coord_offsetC,x+j_coord_offsetD,
1967                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1968                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1969
1970             /* Calculate displacement vector */
1971             dx00             = _mm_sub_ps(ix0,jx0);
1972             dy00             = _mm_sub_ps(iy0,jy0);
1973             dz00             = _mm_sub_ps(iz0,jz0);
1974             dx11             = _mm_sub_ps(ix1,jx1);
1975             dy11             = _mm_sub_ps(iy1,jy1);
1976             dz11             = _mm_sub_ps(iz1,jz1);
1977             dx12             = _mm_sub_ps(ix1,jx2);
1978             dy12             = _mm_sub_ps(iy1,jy2);
1979             dz12             = _mm_sub_ps(iz1,jz2);
1980             dx13             = _mm_sub_ps(ix1,jx3);
1981             dy13             = _mm_sub_ps(iy1,jy3);
1982             dz13             = _mm_sub_ps(iz1,jz3);
1983             dx21             = _mm_sub_ps(ix2,jx1);
1984             dy21             = _mm_sub_ps(iy2,jy1);
1985             dz21             = _mm_sub_ps(iz2,jz1);
1986             dx22             = _mm_sub_ps(ix2,jx2);
1987             dy22             = _mm_sub_ps(iy2,jy2);
1988             dz22             = _mm_sub_ps(iz2,jz2);
1989             dx23             = _mm_sub_ps(ix2,jx3);
1990             dy23             = _mm_sub_ps(iy2,jy3);
1991             dz23             = _mm_sub_ps(iz2,jz3);
1992             dx31             = _mm_sub_ps(ix3,jx1);
1993             dy31             = _mm_sub_ps(iy3,jy1);
1994             dz31             = _mm_sub_ps(iz3,jz1);
1995             dx32             = _mm_sub_ps(ix3,jx2);
1996             dy32             = _mm_sub_ps(iy3,jy2);
1997             dz32             = _mm_sub_ps(iz3,jz2);
1998             dx33             = _mm_sub_ps(ix3,jx3);
1999             dy33             = _mm_sub_ps(iy3,jy3);
2000             dz33             = _mm_sub_ps(iz3,jz3);
2001
2002             /* Calculate squared distance and things based on it */
2003             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2004             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2005             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2006             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2007             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2008             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2009             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2010             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2011             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2012             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2013
2014             rinv11           = avx128fma_invsqrt_f(rsq11);
2015             rinv12           = avx128fma_invsqrt_f(rsq12);
2016             rinv13           = avx128fma_invsqrt_f(rsq13);
2017             rinv21           = avx128fma_invsqrt_f(rsq21);
2018             rinv22           = avx128fma_invsqrt_f(rsq22);
2019             rinv23           = avx128fma_invsqrt_f(rsq23);
2020             rinv31           = avx128fma_invsqrt_f(rsq31);
2021             rinv32           = avx128fma_invsqrt_f(rsq32);
2022             rinv33           = avx128fma_invsqrt_f(rsq33);
2023
2024             rinvsq00         = avx128fma_inv_f(rsq00);
2025
2026             fjx0             = _mm_setzero_ps();
2027             fjy0             = _mm_setzero_ps();
2028             fjz0             = _mm_setzero_ps();
2029             fjx1             = _mm_setzero_ps();
2030             fjy1             = _mm_setzero_ps();
2031             fjz1             = _mm_setzero_ps();
2032             fjx2             = _mm_setzero_ps();
2033             fjy2             = _mm_setzero_ps();
2034             fjz2             = _mm_setzero_ps();
2035             fjx3             = _mm_setzero_ps();
2036             fjy3             = _mm_setzero_ps();
2037             fjz3             = _mm_setzero_ps();
2038
2039             /**************************
2040              * CALCULATE INTERACTIONS *
2041              **************************/
2042
2043             /* LENNARD-JONES DISPERSION/REPULSION */
2044
2045             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2046             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2047
2048             fscal            = fvdw;
2049
2050             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2051
2052              /* Update vectorial force */
2053             fix0             = _mm_macc_ps(dx00,fscal,fix0);
2054             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
2055             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
2056
2057             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
2058             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
2059             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
2060
2061             /**************************
2062              * CALCULATE INTERACTIONS *
2063              **************************/
2064
2065             r11              = _mm_mul_ps(rsq11,rinv11);
2066             r11              = _mm_andnot_ps(dummy_mask,r11);
2067
2068             /* Calculate table index by multiplying r with table scale and truncate to integer */
2069             rt               = _mm_mul_ps(r11,vftabscale);
2070             vfitab           = _mm_cvttps_epi32(rt);
2071 #ifdef __XOP__
2072             vfeps            = _mm_frcz_ps(rt);
2073 #else
2074             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2075 #endif
2076             twovfeps         = _mm_add_ps(vfeps,vfeps);
2077             vfitab           = _mm_slli_epi32(vfitab,2);
2078
2079             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2080             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2081             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2082             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2083             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2084             _MM_TRANSPOSE4_PS(Y,F,G,H);
2085             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2086             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2087             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2088
2089             fscal            = felec;
2090
2091             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2092
2093              /* Update vectorial force */
2094             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2095             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2096             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2097
2098             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2099             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2100             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2101
2102             /**************************
2103              * CALCULATE INTERACTIONS *
2104              **************************/
2105
2106             r12              = _mm_mul_ps(rsq12,rinv12);
2107             r12              = _mm_andnot_ps(dummy_mask,r12);
2108
2109             /* Calculate table index by multiplying r with table scale and truncate to integer */
2110             rt               = _mm_mul_ps(r12,vftabscale);
2111             vfitab           = _mm_cvttps_epi32(rt);
2112 #ifdef __XOP__
2113             vfeps            = _mm_frcz_ps(rt);
2114 #else
2115             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2116 #endif
2117             twovfeps         = _mm_add_ps(vfeps,vfeps);
2118             vfitab           = _mm_slli_epi32(vfitab,2);
2119
2120             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2121             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2122             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2123             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2124             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2125             _MM_TRANSPOSE4_PS(Y,F,G,H);
2126             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2127             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2128             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2129
2130             fscal            = felec;
2131
2132             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2133
2134              /* Update vectorial force */
2135             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2136             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2137             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2138
2139             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2140             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2141             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2142
2143             /**************************
2144              * CALCULATE INTERACTIONS *
2145              **************************/
2146
2147             r13              = _mm_mul_ps(rsq13,rinv13);
2148             r13              = _mm_andnot_ps(dummy_mask,r13);
2149
2150             /* Calculate table index by multiplying r with table scale and truncate to integer */
2151             rt               = _mm_mul_ps(r13,vftabscale);
2152             vfitab           = _mm_cvttps_epi32(rt);
2153 #ifdef __XOP__
2154             vfeps            = _mm_frcz_ps(rt);
2155 #else
2156             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2157 #endif
2158             twovfeps         = _mm_add_ps(vfeps,vfeps);
2159             vfitab           = _mm_slli_epi32(vfitab,2);
2160
2161             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2162             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2163             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2164             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2165             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2166             _MM_TRANSPOSE4_PS(Y,F,G,H);
2167             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2168             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2169             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
2170
2171             fscal            = felec;
2172
2173             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2174
2175              /* Update vectorial force */
2176             fix1             = _mm_macc_ps(dx13,fscal,fix1);
2177             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
2178             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
2179
2180             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
2181             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
2182             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
2183
2184             /**************************
2185              * CALCULATE INTERACTIONS *
2186              **************************/
2187
2188             r21              = _mm_mul_ps(rsq21,rinv21);
2189             r21              = _mm_andnot_ps(dummy_mask,r21);
2190
2191             /* Calculate table index by multiplying r with table scale and truncate to integer */
2192             rt               = _mm_mul_ps(r21,vftabscale);
2193             vfitab           = _mm_cvttps_epi32(rt);
2194 #ifdef __XOP__
2195             vfeps            = _mm_frcz_ps(rt);
2196 #else
2197             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2198 #endif
2199             twovfeps         = _mm_add_ps(vfeps,vfeps);
2200             vfitab           = _mm_slli_epi32(vfitab,2);
2201
2202             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2203             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2204             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2205             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2206             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2207             _MM_TRANSPOSE4_PS(Y,F,G,H);
2208             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2209             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2210             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2211
2212             fscal            = felec;
2213
2214             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2215
2216              /* Update vectorial force */
2217             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2218             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2219             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2220
2221             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2222             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2223             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2224
2225             /**************************
2226              * CALCULATE INTERACTIONS *
2227              **************************/
2228
2229             r22              = _mm_mul_ps(rsq22,rinv22);
2230             r22              = _mm_andnot_ps(dummy_mask,r22);
2231
2232             /* Calculate table index by multiplying r with table scale and truncate to integer */
2233             rt               = _mm_mul_ps(r22,vftabscale);
2234             vfitab           = _mm_cvttps_epi32(rt);
2235 #ifdef __XOP__
2236             vfeps            = _mm_frcz_ps(rt);
2237 #else
2238             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2239 #endif
2240             twovfeps         = _mm_add_ps(vfeps,vfeps);
2241             vfitab           = _mm_slli_epi32(vfitab,2);
2242
2243             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2244             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2245             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2246             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2247             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2248             _MM_TRANSPOSE4_PS(Y,F,G,H);
2249             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2250             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2251             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2252
2253             fscal            = felec;
2254
2255             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2256
2257              /* Update vectorial force */
2258             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2259             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2260             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2261
2262             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2263             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2264             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2265
2266             /**************************
2267              * CALCULATE INTERACTIONS *
2268              **************************/
2269
2270             r23              = _mm_mul_ps(rsq23,rinv23);
2271             r23              = _mm_andnot_ps(dummy_mask,r23);
2272
2273             /* Calculate table index by multiplying r with table scale and truncate to integer */
2274             rt               = _mm_mul_ps(r23,vftabscale);
2275             vfitab           = _mm_cvttps_epi32(rt);
2276 #ifdef __XOP__
2277             vfeps            = _mm_frcz_ps(rt);
2278 #else
2279             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2280 #endif
2281             twovfeps         = _mm_add_ps(vfeps,vfeps);
2282             vfitab           = _mm_slli_epi32(vfitab,2);
2283
2284             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2285             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2286             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2287             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2288             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2289             _MM_TRANSPOSE4_PS(Y,F,G,H);
2290             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2291             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2292             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2293
2294             fscal            = felec;
2295
2296             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2297
2298              /* Update vectorial force */
2299             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2300             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2301             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2302
2303             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2304             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2305             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2306
2307             /**************************
2308              * CALCULATE INTERACTIONS *
2309              **************************/
2310
2311             r31              = _mm_mul_ps(rsq31,rinv31);
2312             r31              = _mm_andnot_ps(dummy_mask,r31);
2313
2314             /* Calculate table index by multiplying r with table scale and truncate to integer */
2315             rt               = _mm_mul_ps(r31,vftabscale);
2316             vfitab           = _mm_cvttps_epi32(rt);
2317 #ifdef __XOP__
2318             vfeps            = _mm_frcz_ps(rt);
2319 #else
2320             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2321 #endif
2322             twovfeps         = _mm_add_ps(vfeps,vfeps);
2323             vfitab           = _mm_slli_epi32(vfitab,2);
2324
2325             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2326             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2327             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2328             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2329             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2330             _MM_TRANSPOSE4_PS(Y,F,G,H);
2331             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2332             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2333             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2334
2335             fscal            = felec;
2336
2337             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2338
2339              /* Update vectorial force */
2340             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2341             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2342             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2343
2344             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2345             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2346             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2347
2348             /**************************
2349              * CALCULATE INTERACTIONS *
2350              **************************/
2351
2352             r32              = _mm_mul_ps(rsq32,rinv32);
2353             r32              = _mm_andnot_ps(dummy_mask,r32);
2354
2355             /* Calculate table index by multiplying r with table scale and truncate to integer */
2356             rt               = _mm_mul_ps(r32,vftabscale);
2357             vfitab           = _mm_cvttps_epi32(rt);
2358 #ifdef __XOP__
2359             vfeps            = _mm_frcz_ps(rt);
2360 #else
2361             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2362 #endif
2363             twovfeps         = _mm_add_ps(vfeps,vfeps);
2364             vfitab           = _mm_slli_epi32(vfitab,2);
2365
2366             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2367             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2368             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2369             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2370             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2371             _MM_TRANSPOSE4_PS(Y,F,G,H);
2372             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2373             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2374             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2375
2376             fscal            = felec;
2377
2378             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2379
2380              /* Update vectorial force */
2381             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2382             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2383             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2384
2385             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2386             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2387             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2388
2389             /**************************
2390              * CALCULATE INTERACTIONS *
2391              **************************/
2392
2393             r33              = _mm_mul_ps(rsq33,rinv33);
2394             r33              = _mm_andnot_ps(dummy_mask,r33);
2395
2396             /* Calculate table index by multiplying r with table scale and truncate to integer */
2397             rt               = _mm_mul_ps(r33,vftabscale);
2398             vfitab           = _mm_cvttps_epi32(rt);
2399 #ifdef __XOP__
2400             vfeps            = _mm_frcz_ps(rt);
2401 #else
2402             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2403 #endif
2404             twovfeps         = _mm_add_ps(vfeps,vfeps);
2405             vfitab           = _mm_slli_epi32(vfitab,2);
2406
2407             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2408             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2409             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2410             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2411             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2412             _MM_TRANSPOSE4_PS(Y,F,G,H);
2413             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2414             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2415             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2416
2417             fscal            = felec;
2418
2419             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2420
2421              /* Update vectorial force */
2422             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2423             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2424             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2425
2426             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2427             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2428             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2429
2430             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2431             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2432             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2433             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2434
2435             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2436                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2437                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2438
2439             /* Inner loop uses 420 flops */
2440         }
2441
2442         /* End of innermost loop */
2443
2444         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2445                                               f+i_coord_offset,fshift+i_shift_offset);
2446
2447         /* Increment number of inner iterations */
2448         inneriter                  += j_index_end - j_index_start;
2449
2450         /* Outer loop uses 24 flops */
2451     }
2452
2453     /* Increment number of outer iterations */
2454     outeriter        += nri;
2455
2456     /* Update outer/inner flops */
2457
2458     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*420);
2459 }