Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCSTab_VdwNone_GeomW3W3_avx_128_fma_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_128_fma_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_single
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            None
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_128_fma_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     __m128i          vfitab;
106     __m128i          ifour       = _mm_set1_epi32(4);
107     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
108     real             *vftab;
109     __m128           dummy_mask,cutoff_mask;
110     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
111     __m128           one     = _mm_set1_ps(1.0);
112     __m128           two     = _mm_set1_ps(2.0);
113     x                = xx[0];
114     f                = ff[0];
115
116     nri              = nlist->nri;
117     iinr             = nlist->iinr;
118     jindex           = nlist->jindex;
119     jjnr             = nlist->jjnr;
120     shiftidx         = nlist->shift;
121     gid              = nlist->gid;
122     shiftvec         = fr->shift_vec[0];
123     fshift           = fr->fshift[0];
124     facel            = _mm_set1_ps(fr->ic->epsfac);
125     charge           = mdatoms->chargeA;
126
127     vftab            = kernel_data->table_elec->data;
128     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
129
130     /* Setup water-specific parameters */
131     inr              = nlist->iinr[0];
132     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
133     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135
136     jq0              = _mm_set1_ps(charge[inr+0]);
137     jq1              = _mm_set1_ps(charge[inr+1]);
138     jq2              = _mm_set1_ps(charge[inr+2]);
139     qq00             = _mm_mul_ps(iq0,jq0);
140     qq01             = _mm_mul_ps(iq0,jq1);
141     qq02             = _mm_mul_ps(iq0,jq2);
142     qq10             = _mm_mul_ps(iq1,jq0);
143     qq11             = _mm_mul_ps(iq1,jq1);
144     qq12             = _mm_mul_ps(iq1,jq2);
145     qq20             = _mm_mul_ps(iq2,jq0);
146     qq21             = _mm_mul_ps(iq2,jq1);
147     qq22             = _mm_mul_ps(iq2,jq2);
148
149     /* Avoid stupid compiler warnings */
150     jnrA = jnrB = jnrC = jnrD = 0;
151     j_coord_offsetA = 0;
152     j_coord_offsetB = 0;
153     j_coord_offsetC = 0;
154     j_coord_offsetD = 0;
155
156     outeriter        = 0;
157     inneriter        = 0;
158
159     for(iidx=0;iidx<4*DIM;iidx++)
160     {
161         scratch[iidx] = 0.0;
162     }
163
164     /* Start outer loop over neighborlists */
165     for(iidx=0; iidx<nri; iidx++)
166     {
167         /* Load shift vector for this list */
168         i_shift_offset   = DIM*shiftidx[iidx];
169
170         /* Load limits for loop over neighbors */
171         j_index_start    = jindex[iidx];
172         j_index_end      = jindex[iidx+1];
173
174         /* Get outer coordinate index */
175         inr              = iinr[iidx];
176         i_coord_offset   = DIM*inr;
177
178         /* Load i particle coords and add shift vector */
179         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
180                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
181
182         fix0             = _mm_setzero_ps();
183         fiy0             = _mm_setzero_ps();
184         fiz0             = _mm_setzero_ps();
185         fix1             = _mm_setzero_ps();
186         fiy1             = _mm_setzero_ps();
187         fiz1             = _mm_setzero_ps();
188         fix2             = _mm_setzero_ps();
189         fiy2             = _mm_setzero_ps();
190         fiz2             = _mm_setzero_ps();
191
192         /* Reset potential sums */
193         velecsum         = _mm_setzero_ps();
194
195         /* Start inner kernel loop */
196         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
197         {
198
199             /* Get j neighbor index, and coordinate index */
200             jnrA             = jjnr[jidx];
201             jnrB             = jjnr[jidx+1];
202             jnrC             = jjnr[jidx+2];
203             jnrD             = jjnr[jidx+3];
204             j_coord_offsetA  = DIM*jnrA;
205             j_coord_offsetB  = DIM*jnrB;
206             j_coord_offsetC  = DIM*jnrC;
207             j_coord_offsetD  = DIM*jnrD;
208
209             /* load j atom coordinates */
210             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
211                                               x+j_coord_offsetC,x+j_coord_offsetD,
212                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
213
214             /* Calculate displacement vector */
215             dx00             = _mm_sub_ps(ix0,jx0);
216             dy00             = _mm_sub_ps(iy0,jy0);
217             dz00             = _mm_sub_ps(iz0,jz0);
218             dx01             = _mm_sub_ps(ix0,jx1);
219             dy01             = _mm_sub_ps(iy0,jy1);
220             dz01             = _mm_sub_ps(iz0,jz1);
221             dx02             = _mm_sub_ps(ix0,jx2);
222             dy02             = _mm_sub_ps(iy0,jy2);
223             dz02             = _mm_sub_ps(iz0,jz2);
224             dx10             = _mm_sub_ps(ix1,jx0);
225             dy10             = _mm_sub_ps(iy1,jy0);
226             dz10             = _mm_sub_ps(iz1,jz0);
227             dx11             = _mm_sub_ps(ix1,jx1);
228             dy11             = _mm_sub_ps(iy1,jy1);
229             dz11             = _mm_sub_ps(iz1,jz1);
230             dx12             = _mm_sub_ps(ix1,jx2);
231             dy12             = _mm_sub_ps(iy1,jy2);
232             dz12             = _mm_sub_ps(iz1,jz2);
233             dx20             = _mm_sub_ps(ix2,jx0);
234             dy20             = _mm_sub_ps(iy2,jy0);
235             dz20             = _mm_sub_ps(iz2,jz0);
236             dx21             = _mm_sub_ps(ix2,jx1);
237             dy21             = _mm_sub_ps(iy2,jy1);
238             dz21             = _mm_sub_ps(iz2,jz1);
239             dx22             = _mm_sub_ps(ix2,jx2);
240             dy22             = _mm_sub_ps(iy2,jy2);
241             dz22             = _mm_sub_ps(iz2,jz2);
242
243             /* Calculate squared distance and things based on it */
244             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
245             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
246             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
247             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
248             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
249             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
250             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
251             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
252             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
253
254             rinv00           = avx128fma_invsqrt_f(rsq00);
255             rinv01           = avx128fma_invsqrt_f(rsq01);
256             rinv02           = avx128fma_invsqrt_f(rsq02);
257             rinv10           = avx128fma_invsqrt_f(rsq10);
258             rinv11           = avx128fma_invsqrt_f(rsq11);
259             rinv12           = avx128fma_invsqrt_f(rsq12);
260             rinv20           = avx128fma_invsqrt_f(rsq20);
261             rinv21           = avx128fma_invsqrt_f(rsq21);
262             rinv22           = avx128fma_invsqrt_f(rsq22);
263
264             fjx0             = _mm_setzero_ps();
265             fjy0             = _mm_setzero_ps();
266             fjz0             = _mm_setzero_ps();
267             fjx1             = _mm_setzero_ps();
268             fjy1             = _mm_setzero_ps();
269             fjz1             = _mm_setzero_ps();
270             fjx2             = _mm_setzero_ps();
271             fjy2             = _mm_setzero_ps();
272             fjz2             = _mm_setzero_ps();
273
274             /**************************
275              * CALCULATE INTERACTIONS *
276              **************************/
277
278             r00              = _mm_mul_ps(rsq00,rinv00);
279
280             /* Calculate table index by multiplying r with table scale and truncate to integer */
281             rt               = _mm_mul_ps(r00,vftabscale);
282             vfitab           = _mm_cvttps_epi32(rt);
283 #ifdef __XOP__
284             vfeps            = _mm_frcz_ps(rt);
285 #else
286             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
287 #endif
288             twovfeps         = _mm_add_ps(vfeps,vfeps);
289             vfitab           = _mm_slli_epi32(vfitab,2);
290
291             /* CUBIC SPLINE TABLE ELECTROSTATICS */
292             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
293             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
294             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
295             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
296             _MM_TRANSPOSE4_PS(Y,F,G,H);
297             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
298             VV               = _mm_macc_ps(vfeps,Fp,Y);
299             velec            = _mm_mul_ps(qq00,VV);
300             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
301             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
302
303             /* Update potential sum for this i atom from the interaction with this j atom. */
304             velecsum         = _mm_add_ps(velecsum,velec);
305
306             fscal            = felec;
307
308              /* Update vectorial force */
309             fix0             = _mm_macc_ps(dx00,fscal,fix0);
310             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
311             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
312
313             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
314             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
315             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
316
317             /**************************
318              * CALCULATE INTERACTIONS *
319              **************************/
320
321             r01              = _mm_mul_ps(rsq01,rinv01);
322
323             /* Calculate table index by multiplying r with table scale and truncate to integer */
324             rt               = _mm_mul_ps(r01,vftabscale);
325             vfitab           = _mm_cvttps_epi32(rt);
326 #ifdef __XOP__
327             vfeps            = _mm_frcz_ps(rt);
328 #else
329             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
330 #endif
331             twovfeps         = _mm_add_ps(vfeps,vfeps);
332             vfitab           = _mm_slli_epi32(vfitab,2);
333
334             /* CUBIC SPLINE TABLE ELECTROSTATICS */
335             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
336             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
337             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
338             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
339             _MM_TRANSPOSE4_PS(Y,F,G,H);
340             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
341             VV               = _mm_macc_ps(vfeps,Fp,Y);
342             velec            = _mm_mul_ps(qq01,VV);
343             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
344             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
345
346             /* Update potential sum for this i atom from the interaction with this j atom. */
347             velecsum         = _mm_add_ps(velecsum,velec);
348
349             fscal            = felec;
350
351              /* Update vectorial force */
352             fix0             = _mm_macc_ps(dx01,fscal,fix0);
353             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
354             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
355
356             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
357             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
358             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
359
360             /**************************
361              * CALCULATE INTERACTIONS *
362              **************************/
363
364             r02              = _mm_mul_ps(rsq02,rinv02);
365
366             /* Calculate table index by multiplying r with table scale and truncate to integer */
367             rt               = _mm_mul_ps(r02,vftabscale);
368             vfitab           = _mm_cvttps_epi32(rt);
369 #ifdef __XOP__
370             vfeps            = _mm_frcz_ps(rt);
371 #else
372             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
373 #endif
374             twovfeps         = _mm_add_ps(vfeps,vfeps);
375             vfitab           = _mm_slli_epi32(vfitab,2);
376
377             /* CUBIC SPLINE TABLE ELECTROSTATICS */
378             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
379             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
380             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
381             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
382             _MM_TRANSPOSE4_PS(Y,F,G,H);
383             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
384             VV               = _mm_macc_ps(vfeps,Fp,Y);
385             velec            = _mm_mul_ps(qq02,VV);
386             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
387             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velecsum         = _mm_add_ps(velecsum,velec);
391
392             fscal            = felec;
393
394              /* Update vectorial force */
395             fix0             = _mm_macc_ps(dx02,fscal,fix0);
396             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
397             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
398
399             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
400             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
401             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
402
403             /**************************
404              * CALCULATE INTERACTIONS *
405              **************************/
406
407             r10              = _mm_mul_ps(rsq10,rinv10);
408
409             /* Calculate table index by multiplying r with table scale and truncate to integer */
410             rt               = _mm_mul_ps(r10,vftabscale);
411             vfitab           = _mm_cvttps_epi32(rt);
412 #ifdef __XOP__
413             vfeps            = _mm_frcz_ps(rt);
414 #else
415             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
416 #endif
417             twovfeps         = _mm_add_ps(vfeps,vfeps);
418             vfitab           = _mm_slli_epi32(vfitab,2);
419
420             /* CUBIC SPLINE TABLE ELECTROSTATICS */
421             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
422             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
423             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
424             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
425             _MM_TRANSPOSE4_PS(Y,F,G,H);
426             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
427             VV               = _mm_macc_ps(vfeps,Fp,Y);
428             velec            = _mm_mul_ps(qq10,VV);
429             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
430             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
431
432             /* Update potential sum for this i atom from the interaction with this j atom. */
433             velecsum         = _mm_add_ps(velecsum,velec);
434
435             fscal            = felec;
436
437              /* Update vectorial force */
438             fix1             = _mm_macc_ps(dx10,fscal,fix1);
439             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
440             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
441
442             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
443             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
444             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
445
446             /**************************
447              * CALCULATE INTERACTIONS *
448              **************************/
449
450             r11              = _mm_mul_ps(rsq11,rinv11);
451
452             /* Calculate table index by multiplying r with table scale and truncate to integer */
453             rt               = _mm_mul_ps(r11,vftabscale);
454             vfitab           = _mm_cvttps_epi32(rt);
455 #ifdef __XOP__
456             vfeps            = _mm_frcz_ps(rt);
457 #else
458             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
459 #endif
460             twovfeps         = _mm_add_ps(vfeps,vfeps);
461             vfitab           = _mm_slli_epi32(vfitab,2);
462
463             /* CUBIC SPLINE TABLE ELECTROSTATICS */
464             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
465             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
466             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
467             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
468             _MM_TRANSPOSE4_PS(Y,F,G,H);
469             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
470             VV               = _mm_macc_ps(vfeps,Fp,Y);
471             velec            = _mm_mul_ps(qq11,VV);
472             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
473             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
474
475             /* Update potential sum for this i atom from the interaction with this j atom. */
476             velecsum         = _mm_add_ps(velecsum,velec);
477
478             fscal            = felec;
479
480              /* Update vectorial force */
481             fix1             = _mm_macc_ps(dx11,fscal,fix1);
482             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
483             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
484
485             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
486             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
487             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
488
489             /**************************
490              * CALCULATE INTERACTIONS *
491              **************************/
492
493             r12              = _mm_mul_ps(rsq12,rinv12);
494
495             /* Calculate table index by multiplying r with table scale and truncate to integer */
496             rt               = _mm_mul_ps(r12,vftabscale);
497             vfitab           = _mm_cvttps_epi32(rt);
498 #ifdef __XOP__
499             vfeps            = _mm_frcz_ps(rt);
500 #else
501             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
502 #endif
503             twovfeps         = _mm_add_ps(vfeps,vfeps);
504             vfitab           = _mm_slli_epi32(vfitab,2);
505
506             /* CUBIC SPLINE TABLE ELECTROSTATICS */
507             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
508             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
509             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
510             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
511             _MM_TRANSPOSE4_PS(Y,F,G,H);
512             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
513             VV               = _mm_macc_ps(vfeps,Fp,Y);
514             velec            = _mm_mul_ps(qq12,VV);
515             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
516             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
517
518             /* Update potential sum for this i atom from the interaction with this j atom. */
519             velecsum         = _mm_add_ps(velecsum,velec);
520
521             fscal            = felec;
522
523              /* Update vectorial force */
524             fix1             = _mm_macc_ps(dx12,fscal,fix1);
525             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
526             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
527
528             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
529             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
530             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
531
532             /**************************
533              * CALCULATE INTERACTIONS *
534              **************************/
535
536             r20              = _mm_mul_ps(rsq20,rinv20);
537
538             /* Calculate table index by multiplying r with table scale and truncate to integer */
539             rt               = _mm_mul_ps(r20,vftabscale);
540             vfitab           = _mm_cvttps_epi32(rt);
541 #ifdef __XOP__
542             vfeps            = _mm_frcz_ps(rt);
543 #else
544             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
545 #endif
546             twovfeps         = _mm_add_ps(vfeps,vfeps);
547             vfitab           = _mm_slli_epi32(vfitab,2);
548
549             /* CUBIC SPLINE TABLE ELECTROSTATICS */
550             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
551             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
552             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
553             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
554             _MM_TRANSPOSE4_PS(Y,F,G,H);
555             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
556             VV               = _mm_macc_ps(vfeps,Fp,Y);
557             velec            = _mm_mul_ps(qq20,VV);
558             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
559             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
560
561             /* Update potential sum for this i atom from the interaction with this j atom. */
562             velecsum         = _mm_add_ps(velecsum,velec);
563
564             fscal            = felec;
565
566              /* Update vectorial force */
567             fix2             = _mm_macc_ps(dx20,fscal,fix2);
568             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
569             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
570
571             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
572             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
573             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
574
575             /**************************
576              * CALCULATE INTERACTIONS *
577              **************************/
578
579             r21              = _mm_mul_ps(rsq21,rinv21);
580
581             /* Calculate table index by multiplying r with table scale and truncate to integer */
582             rt               = _mm_mul_ps(r21,vftabscale);
583             vfitab           = _mm_cvttps_epi32(rt);
584 #ifdef __XOP__
585             vfeps            = _mm_frcz_ps(rt);
586 #else
587             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
588 #endif
589             twovfeps         = _mm_add_ps(vfeps,vfeps);
590             vfitab           = _mm_slli_epi32(vfitab,2);
591
592             /* CUBIC SPLINE TABLE ELECTROSTATICS */
593             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
594             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
595             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
596             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
597             _MM_TRANSPOSE4_PS(Y,F,G,H);
598             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
599             VV               = _mm_macc_ps(vfeps,Fp,Y);
600             velec            = _mm_mul_ps(qq21,VV);
601             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
602             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
603
604             /* Update potential sum for this i atom from the interaction with this j atom. */
605             velecsum         = _mm_add_ps(velecsum,velec);
606
607             fscal            = felec;
608
609              /* Update vectorial force */
610             fix2             = _mm_macc_ps(dx21,fscal,fix2);
611             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
612             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
613
614             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
615             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
616             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
617
618             /**************************
619              * CALCULATE INTERACTIONS *
620              **************************/
621
622             r22              = _mm_mul_ps(rsq22,rinv22);
623
624             /* Calculate table index by multiplying r with table scale and truncate to integer */
625             rt               = _mm_mul_ps(r22,vftabscale);
626             vfitab           = _mm_cvttps_epi32(rt);
627 #ifdef __XOP__
628             vfeps            = _mm_frcz_ps(rt);
629 #else
630             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
631 #endif
632             twovfeps         = _mm_add_ps(vfeps,vfeps);
633             vfitab           = _mm_slli_epi32(vfitab,2);
634
635             /* CUBIC SPLINE TABLE ELECTROSTATICS */
636             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
637             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
638             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
639             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
640             _MM_TRANSPOSE4_PS(Y,F,G,H);
641             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
642             VV               = _mm_macc_ps(vfeps,Fp,Y);
643             velec            = _mm_mul_ps(qq22,VV);
644             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
645             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
646
647             /* Update potential sum for this i atom from the interaction with this j atom. */
648             velecsum         = _mm_add_ps(velecsum,velec);
649
650             fscal            = felec;
651
652              /* Update vectorial force */
653             fix2             = _mm_macc_ps(dx22,fscal,fix2);
654             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
655             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
656
657             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
658             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
659             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
660
661             fjptrA             = f+j_coord_offsetA;
662             fjptrB             = f+j_coord_offsetB;
663             fjptrC             = f+j_coord_offsetC;
664             fjptrD             = f+j_coord_offsetD;
665
666             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
667                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
668
669             /* Inner loop uses 414 flops */
670         }
671
672         if(jidx<j_index_end)
673         {
674
675             /* Get j neighbor index, and coordinate index */
676             jnrlistA         = jjnr[jidx];
677             jnrlistB         = jjnr[jidx+1];
678             jnrlistC         = jjnr[jidx+2];
679             jnrlistD         = jjnr[jidx+3];
680             /* Sign of each element will be negative for non-real atoms.
681              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
682              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
683              */
684             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
685             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
686             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
687             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
688             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
689             j_coord_offsetA  = DIM*jnrA;
690             j_coord_offsetB  = DIM*jnrB;
691             j_coord_offsetC  = DIM*jnrC;
692             j_coord_offsetD  = DIM*jnrD;
693
694             /* load j atom coordinates */
695             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
696                                               x+j_coord_offsetC,x+j_coord_offsetD,
697                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
698
699             /* Calculate displacement vector */
700             dx00             = _mm_sub_ps(ix0,jx0);
701             dy00             = _mm_sub_ps(iy0,jy0);
702             dz00             = _mm_sub_ps(iz0,jz0);
703             dx01             = _mm_sub_ps(ix0,jx1);
704             dy01             = _mm_sub_ps(iy0,jy1);
705             dz01             = _mm_sub_ps(iz0,jz1);
706             dx02             = _mm_sub_ps(ix0,jx2);
707             dy02             = _mm_sub_ps(iy0,jy2);
708             dz02             = _mm_sub_ps(iz0,jz2);
709             dx10             = _mm_sub_ps(ix1,jx0);
710             dy10             = _mm_sub_ps(iy1,jy0);
711             dz10             = _mm_sub_ps(iz1,jz0);
712             dx11             = _mm_sub_ps(ix1,jx1);
713             dy11             = _mm_sub_ps(iy1,jy1);
714             dz11             = _mm_sub_ps(iz1,jz1);
715             dx12             = _mm_sub_ps(ix1,jx2);
716             dy12             = _mm_sub_ps(iy1,jy2);
717             dz12             = _mm_sub_ps(iz1,jz2);
718             dx20             = _mm_sub_ps(ix2,jx0);
719             dy20             = _mm_sub_ps(iy2,jy0);
720             dz20             = _mm_sub_ps(iz2,jz0);
721             dx21             = _mm_sub_ps(ix2,jx1);
722             dy21             = _mm_sub_ps(iy2,jy1);
723             dz21             = _mm_sub_ps(iz2,jz1);
724             dx22             = _mm_sub_ps(ix2,jx2);
725             dy22             = _mm_sub_ps(iy2,jy2);
726             dz22             = _mm_sub_ps(iz2,jz2);
727
728             /* Calculate squared distance and things based on it */
729             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
730             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
731             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
732             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
733             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
734             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
735             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
736             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
737             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
738
739             rinv00           = avx128fma_invsqrt_f(rsq00);
740             rinv01           = avx128fma_invsqrt_f(rsq01);
741             rinv02           = avx128fma_invsqrt_f(rsq02);
742             rinv10           = avx128fma_invsqrt_f(rsq10);
743             rinv11           = avx128fma_invsqrt_f(rsq11);
744             rinv12           = avx128fma_invsqrt_f(rsq12);
745             rinv20           = avx128fma_invsqrt_f(rsq20);
746             rinv21           = avx128fma_invsqrt_f(rsq21);
747             rinv22           = avx128fma_invsqrt_f(rsq22);
748
749             fjx0             = _mm_setzero_ps();
750             fjy0             = _mm_setzero_ps();
751             fjz0             = _mm_setzero_ps();
752             fjx1             = _mm_setzero_ps();
753             fjy1             = _mm_setzero_ps();
754             fjz1             = _mm_setzero_ps();
755             fjx2             = _mm_setzero_ps();
756             fjy2             = _mm_setzero_ps();
757             fjz2             = _mm_setzero_ps();
758
759             /**************************
760              * CALCULATE INTERACTIONS *
761              **************************/
762
763             r00              = _mm_mul_ps(rsq00,rinv00);
764             r00              = _mm_andnot_ps(dummy_mask,r00);
765
766             /* Calculate table index by multiplying r with table scale and truncate to integer */
767             rt               = _mm_mul_ps(r00,vftabscale);
768             vfitab           = _mm_cvttps_epi32(rt);
769 #ifdef __XOP__
770             vfeps            = _mm_frcz_ps(rt);
771 #else
772             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
773 #endif
774             twovfeps         = _mm_add_ps(vfeps,vfeps);
775             vfitab           = _mm_slli_epi32(vfitab,2);
776
777             /* CUBIC SPLINE TABLE ELECTROSTATICS */
778             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
779             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
780             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
781             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
782             _MM_TRANSPOSE4_PS(Y,F,G,H);
783             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
784             VV               = _mm_macc_ps(vfeps,Fp,Y);
785             velec            = _mm_mul_ps(qq00,VV);
786             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
787             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
788
789             /* Update potential sum for this i atom from the interaction with this j atom. */
790             velec            = _mm_andnot_ps(dummy_mask,velec);
791             velecsum         = _mm_add_ps(velecsum,velec);
792
793             fscal            = felec;
794
795             fscal            = _mm_andnot_ps(dummy_mask,fscal);
796
797              /* Update vectorial force */
798             fix0             = _mm_macc_ps(dx00,fscal,fix0);
799             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
800             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
801
802             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
803             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
804             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
805
806             /**************************
807              * CALCULATE INTERACTIONS *
808              **************************/
809
810             r01              = _mm_mul_ps(rsq01,rinv01);
811             r01              = _mm_andnot_ps(dummy_mask,r01);
812
813             /* Calculate table index by multiplying r with table scale and truncate to integer */
814             rt               = _mm_mul_ps(r01,vftabscale);
815             vfitab           = _mm_cvttps_epi32(rt);
816 #ifdef __XOP__
817             vfeps            = _mm_frcz_ps(rt);
818 #else
819             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
820 #endif
821             twovfeps         = _mm_add_ps(vfeps,vfeps);
822             vfitab           = _mm_slli_epi32(vfitab,2);
823
824             /* CUBIC SPLINE TABLE ELECTROSTATICS */
825             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
826             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
827             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
828             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
829             _MM_TRANSPOSE4_PS(Y,F,G,H);
830             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
831             VV               = _mm_macc_ps(vfeps,Fp,Y);
832             velec            = _mm_mul_ps(qq01,VV);
833             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
834             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
835
836             /* Update potential sum for this i atom from the interaction with this j atom. */
837             velec            = _mm_andnot_ps(dummy_mask,velec);
838             velecsum         = _mm_add_ps(velecsum,velec);
839
840             fscal            = felec;
841
842             fscal            = _mm_andnot_ps(dummy_mask,fscal);
843
844              /* Update vectorial force */
845             fix0             = _mm_macc_ps(dx01,fscal,fix0);
846             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
847             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
848
849             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
850             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
851             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
852
853             /**************************
854              * CALCULATE INTERACTIONS *
855              **************************/
856
857             r02              = _mm_mul_ps(rsq02,rinv02);
858             r02              = _mm_andnot_ps(dummy_mask,r02);
859
860             /* Calculate table index by multiplying r with table scale and truncate to integer */
861             rt               = _mm_mul_ps(r02,vftabscale);
862             vfitab           = _mm_cvttps_epi32(rt);
863 #ifdef __XOP__
864             vfeps            = _mm_frcz_ps(rt);
865 #else
866             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
867 #endif
868             twovfeps         = _mm_add_ps(vfeps,vfeps);
869             vfitab           = _mm_slli_epi32(vfitab,2);
870
871             /* CUBIC SPLINE TABLE ELECTROSTATICS */
872             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
873             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
874             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
875             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
876             _MM_TRANSPOSE4_PS(Y,F,G,H);
877             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
878             VV               = _mm_macc_ps(vfeps,Fp,Y);
879             velec            = _mm_mul_ps(qq02,VV);
880             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
881             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
882
883             /* Update potential sum for this i atom from the interaction with this j atom. */
884             velec            = _mm_andnot_ps(dummy_mask,velec);
885             velecsum         = _mm_add_ps(velecsum,velec);
886
887             fscal            = felec;
888
889             fscal            = _mm_andnot_ps(dummy_mask,fscal);
890
891              /* Update vectorial force */
892             fix0             = _mm_macc_ps(dx02,fscal,fix0);
893             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
894             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
895
896             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
897             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
898             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
899
900             /**************************
901              * CALCULATE INTERACTIONS *
902              **************************/
903
904             r10              = _mm_mul_ps(rsq10,rinv10);
905             r10              = _mm_andnot_ps(dummy_mask,r10);
906
907             /* Calculate table index by multiplying r with table scale and truncate to integer */
908             rt               = _mm_mul_ps(r10,vftabscale);
909             vfitab           = _mm_cvttps_epi32(rt);
910 #ifdef __XOP__
911             vfeps            = _mm_frcz_ps(rt);
912 #else
913             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
914 #endif
915             twovfeps         = _mm_add_ps(vfeps,vfeps);
916             vfitab           = _mm_slli_epi32(vfitab,2);
917
918             /* CUBIC SPLINE TABLE ELECTROSTATICS */
919             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
920             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
921             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
922             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
923             _MM_TRANSPOSE4_PS(Y,F,G,H);
924             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
925             VV               = _mm_macc_ps(vfeps,Fp,Y);
926             velec            = _mm_mul_ps(qq10,VV);
927             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
928             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
929
930             /* Update potential sum for this i atom from the interaction with this j atom. */
931             velec            = _mm_andnot_ps(dummy_mask,velec);
932             velecsum         = _mm_add_ps(velecsum,velec);
933
934             fscal            = felec;
935
936             fscal            = _mm_andnot_ps(dummy_mask,fscal);
937
938              /* Update vectorial force */
939             fix1             = _mm_macc_ps(dx10,fscal,fix1);
940             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
941             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
942
943             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
944             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
945             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
946
947             /**************************
948              * CALCULATE INTERACTIONS *
949              **************************/
950
951             r11              = _mm_mul_ps(rsq11,rinv11);
952             r11              = _mm_andnot_ps(dummy_mask,r11);
953
954             /* Calculate table index by multiplying r with table scale and truncate to integer */
955             rt               = _mm_mul_ps(r11,vftabscale);
956             vfitab           = _mm_cvttps_epi32(rt);
957 #ifdef __XOP__
958             vfeps            = _mm_frcz_ps(rt);
959 #else
960             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
961 #endif
962             twovfeps         = _mm_add_ps(vfeps,vfeps);
963             vfitab           = _mm_slli_epi32(vfitab,2);
964
965             /* CUBIC SPLINE TABLE ELECTROSTATICS */
966             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
967             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
968             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
969             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
970             _MM_TRANSPOSE4_PS(Y,F,G,H);
971             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
972             VV               = _mm_macc_ps(vfeps,Fp,Y);
973             velec            = _mm_mul_ps(qq11,VV);
974             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
975             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
976
977             /* Update potential sum for this i atom from the interaction with this j atom. */
978             velec            = _mm_andnot_ps(dummy_mask,velec);
979             velecsum         = _mm_add_ps(velecsum,velec);
980
981             fscal            = felec;
982
983             fscal            = _mm_andnot_ps(dummy_mask,fscal);
984
985              /* Update vectorial force */
986             fix1             = _mm_macc_ps(dx11,fscal,fix1);
987             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
988             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
989
990             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
991             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
992             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
993
994             /**************************
995              * CALCULATE INTERACTIONS *
996              **************************/
997
998             r12              = _mm_mul_ps(rsq12,rinv12);
999             r12              = _mm_andnot_ps(dummy_mask,r12);
1000
1001             /* Calculate table index by multiplying r with table scale and truncate to integer */
1002             rt               = _mm_mul_ps(r12,vftabscale);
1003             vfitab           = _mm_cvttps_epi32(rt);
1004 #ifdef __XOP__
1005             vfeps            = _mm_frcz_ps(rt);
1006 #else
1007             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1008 #endif
1009             twovfeps         = _mm_add_ps(vfeps,vfeps);
1010             vfitab           = _mm_slli_epi32(vfitab,2);
1011
1012             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1013             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1014             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1015             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1016             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1017             _MM_TRANSPOSE4_PS(Y,F,G,H);
1018             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1019             VV               = _mm_macc_ps(vfeps,Fp,Y);
1020             velec            = _mm_mul_ps(qq12,VV);
1021             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1022             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1023
1024             /* Update potential sum for this i atom from the interaction with this j atom. */
1025             velec            = _mm_andnot_ps(dummy_mask,velec);
1026             velecsum         = _mm_add_ps(velecsum,velec);
1027
1028             fscal            = felec;
1029
1030             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1031
1032              /* Update vectorial force */
1033             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1034             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1035             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1036
1037             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1038             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1039             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1040
1041             /**************************
1042              * CALCULATE INTERACTIONS *
1043              **************************/
1044
1045             r20              = _mm_mul_ps(rsq20,rinv20);
1046             r20              = _mm_andnot_ps(dummy_mask,r20);
1047
1048             /* Calculate table index by multiplying r with table scale and truncate to integer */
1049             rt               = _mm_mul_ps(r20,vftabscale);
1050             vfitab           = _mm_cvttps_epi32(rt);
1051 #ifdef __XOP__
1052             vfeps            = _mm_frcz_ps(rt);
1053 #else
1054             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1055 #endif
1056             twovfeps         = _mm_add_ps(vfeps,vfeps);
1057             vfitab           = _mm_slli_epi32(vfitab,2);
1058
1059             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1060             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1061             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1062             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1063             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1064             _MM_TRANSPOSE4_PS(Y,F,G,H);
1065             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1066             VV               = _mm_macc_ps(vfeps,Fp,Y);
1067             velec            = _mm_mul_ps(qq20,VV);
1068             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1069             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1070
1071             /* Update potential sum for this i atom from the interaction with this j atom. */
1072             velec            = _mm_andnot_ps(dummy_mask,velec);
1073             velecsum         = _mm_add_ps(velecsum,velec);
1074
1075             fscal            = felec;
1076
1077             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1078
1079              /* Update vectorial force */
1080             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1081             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1082             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1083
1084             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1085             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1086             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1087
1088             /**************************
1089              * CALCULATE INTERACTIONS *
1090              **************************/
1091
1092             r21              = _mm_mul_ps(rsq21,rinv21);
1093             r21              = _mm_andnot_ps(dummy_mask,r21);
1094
1095             /* Calculate table index by multiplying r with table scale and truncate to integer */
1096             rt               = _mm_mul_ps(r21,vftabscale);
1097             vfitab           = _mm_cvttps_epi32(rt);
1098 #ifdef __XOP__
1099             vfeps            = _mm_frcz_ps(rt);
1100 #else
1101             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1102 #endif
1103             twovfeps         = _mm_add_ps(vfeps,vfeps);
1104             vfitab           = _mm_slli_epi32(vfitab,2);
1105
1106             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1107             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1108             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1109             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1110             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1111             _MM_TRANSPOSE4_PS(Y,F,G,H);
1112             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1113             VV               = _mm_macc_ps(vfeps,Fp,Y);
1114             velec            = _mm_mul_ps(qq21,VV);
1115             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1116             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1117
1118             /* Update potential sum for this i atom from the interaction with this j atom. */
1119             velec            = _mm_andnot_ps(dummy_mask,velec);
1120             velecsum         = _mm_add_ps(velecsum,velec);
1121
1122             fscal            = felec;
1123
1124             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1125
1126              /* Update vectorial force */
1127             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1128             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1129             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1130
1131             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1132             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1133             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1134
1135             /**************************
1136              * CALCULATE INTERACTIONS *
1137              **************************/
1138
1139             r22              = _mm_mul_ps(rsq22,rinv22);
1140             r22              = _mm_andnot_ps(dummy_mask,r22);
1141
1142             /* Calculate table index by multiplying r with table scale and truncate to integer */
1143             rt               = _mm_mul_ps(r22,vftabscale);
1144             vfitab           = _mm_cvttps_epi32(rt);
1145 #ifdef __XOP__
1146             vfeps            = _mm_frcz_ps(rt);
1147 #else
1148             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1149 #endif
1150             twovfeps         = _mm_add_ps(vfeps,vfeps);
1151             vfitab           = _mm_slli_epi32(vfitab,2);
1152
1153             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1154             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1155             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1156             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1157             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1158             _MM_TRANSPOSE4_PS(Y,F,G,H);
1159             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1160             VV               = _mm_macc_ps(vfeps,Fp,Y);
1161             velec            = _mm_mul_ps(qq22,VV);
1162             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1163             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1164
1165             /* Update potential sum for this i atom from the interaction with this j atom. */
1166             velec            = _mm_andnot_ps(dummy_mask,velec);
1167             velecsum         = _mm_add_ps(velecsum,velec);
1168
1169             fscal            = felec;
1170
1171             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1172
1173              /* Update vectorial force */
1174             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1175             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1176             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1177
1178             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1179             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1180             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1181
1182             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1183             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1184             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1185             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1186
1187             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1188                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1189
1190             /* Inner loop uses 423 flops */
1191         }
1192
1193         /* End of innermost loop */
1194
1195         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1196                                               f+i_coord_offset,fshift+i_shift_offset);
1197
1198         ggid                        = gid[iidx];
1199         /* Update potential energies */
1200         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1201
1202         /* Increment number of inner iterations */
1203         inneriter                  += j_index_end - j_index_start;
1204
1205         /* Outer loop uses 19 flops */
1206     }
1207
1208     /* Increment number of outer iterations */
1209     outeriter        += nri;
1210
1211     /* Update outer/inner flops */
1212
1213     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*423);
1214 }
1215 /*
1216  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_single
1217  * Electrostatics interaction: CubicSplineTable
1218  * VdW interaction:            None
1219  * Geometry:                   Water3-Water3
1220  * Calculate force/pot:        Force
1221  */
1222 void
1223 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_128_fma_single
1224                     (t_nblist                    * gmx_restrict       nlist,
1225                      rvec                        * gmx_restrict          xx,
1226                      rvec                        * gmx_restrict          ff,
1227                      struct t_forcerec           * gmx_restrict          fr,
1228                      t_mdatoms                   * gmx_restrict     mdatoms,
1229                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1230                      t_nrnb                      * gmx_restrict        nrnb)
1231 {
1232     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1233      * just 0 for non-waters.
1234      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1235      * jnr indices corresponding to data put in the four positions in the SIMD register.
1236      */
1237     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1238     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1239     int              jnrA,jnrB,jnrC,jnrD;
1240     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1241     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1242     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1243     real             rcutoff_scalar;
1244     real             *shiftvec,*fshift,*x,*f;
1245     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1246     real             scratch[4*DIM];
1247     __m128           fscal,rcutoff,rcutoff2,jidxall;
1248     int              vdwioffset0;
1249     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1250     int              vdwioffset1;
1251     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1252     int              vdwioffset2;
1253     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1254     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1255     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1256     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1257     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1258     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1259     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1260     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1261     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1262     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1263     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1264     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1265     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1266     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1267     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1268     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1269     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1270     real             *charge;
1271     __m128i          vfitab;
1272     __m128i          ifour       = _mm_set1_epi32(4);
1273     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1274     real             *vftab;
1275     __m128           dummy_mask,cutoff_mask;
1276     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1277     __m128           one     = _mm_set1_ps(1.0);
1278     __m128           two     = _mm_set1_ps(2.0);
1279     x                = xx[0];
1280     f                = ff[0];
1281
1282     nri              = nlist->nri;
1283     iinr             = nlist->iinr;
1284     jindex           = nlist->jindex;
1285     jjnr             = nlist->jjnr;
1286     shiftidx         = nlist->shift;
1287     gid              = nlist->gid;
1288     shiftvec         = fr->shift_vec[0];
1289     fshift           = fr->fshift[0];
1290     facel            = _mm_set1_ps(fr->ic->epsfac);
1291     charge           = mdatoms->chargeA;
1292
1293     vftab            = kernel_data->table_elec->data;
1294     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1295
1296     /* Setup water-specific parameters */
1297     inr              = nlist->iinr[0];
1298     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1299     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1300     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1301
1302     jq0              = _mm_set1_ps(charge[inr+0]);
1303     jq1              = _mm_set1_ps(charge[inr+1]);
1304     jq2              = _mm_set1_ps(charge[inr+2]);
1305     qq00             = _mm_mul_ps(iq0,jq0);
1306     qq01             = _mm_mul_ps(iq0,jq1);
1307     qq02             = _mm_mul_ps(iq0,jq2);
1308     qq10             = _mm_mul_ps(iq1,jq0);
1309     qq11             = _mm_mul_ps(iq1,jq1);
1310     qq12             = _mm_mul_ps(iq1,jq2);
1311     qq20             = _mm_mul_ps(iq2,jq0);
1312     qq21             = _mm_mul_ps(iq2,jq1);
1313     qq22             = _mm_mul_ps(iq2,jq2);
1314
1315     /* Avoid stupid compiler warnings */
1316     jnrA = jnrB = jnrC = jnrD = 0;
1317     j_coord_offsetA = 0;
1318     j_coord_offsetB = 0;
1319     j_coord_offsetC = 0;
1320     j_coord_offsetD = 0;
1321
1322     outeriter        = 0;
1323     inneriter        = 0;
1324
1325     for(iidx=0;iidx<4*DIM;iidx++)
1326     {
1327         scratch[iidx] = 0.0;
1328     }
1329
1330     /* Start outer loop over neighborlists */
1331     for(iidx=0; iidx<nri; iidx++)
1332     {
1333         /* Load shift vector for this list */
1334         i_shift_offset   = DIM*shiftidx[iidx];
1335
1336         /* Load limits for loop over neighbors */
1337         j_index_start    = jindex[iidx];
1338         j_index_end      = jindex[iidx+1];
1339
1340         /* Get outer coordinate index */
1341         inr              = iinr[iidx];
1342         i_coord_offset   = DIM*inr;
1343
1344         /* Load i particle coords and add shift vector */
1345         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1346                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1347
1348         fix0             = _mm_setzero_ps();
1349         fiy0             = _mm_setzero_ps();
1350         fiz0             = _mm_setzero_ps();
1351         fix1             = _mm_setzero_ps();
1352         fiy1             = _mm_setzero_ps();
1353         fiz1             = _mm_setzero_ps();
1354         fix2             = _mm_setzero_ps();
1355         fiy2             = _mm_setzero_ps();
1356         fiz2             = _mm_setzero_ps();
1357
1358         /* Start inner kernel loop */
1359         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1360         {
1361
1362             /* Get j neighbor index, and coordinate index */
1363             jnrA             = jjnr[jidx];
1364             jnrB             = jjnr[jidx+1];
1365             jnrC             = jjnr[jidx+2];
1366             jnrD             = jjnr[jidx+3];
1367             j_coord_offsetA  = DIM*jnrA;
1368             j_coord_offsetB  = DIM*jnrB;
1369             j_coord_offsetC  = DIM*jnrC;
1370             j_coord_offsetD  = DIM*jnrD;
1371
1372             /* load j atom coordinates */
1373             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1374                                               x+j_coord_offsetC,x+j_coord_offsetD,
1375                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1376
1377             /* Calculate displacement vector */
1378             dx00             = _mm_sub_ps(ix0,jx0);
1379             dy00             = _mm_sub_ps(iy0,jy0);
1380             dz00             = _mm_sub_ps(iz0,jz0);
1381             dx01             = _mm_sub_ps(ix0,jx1);
1382             dy01             = _mm_sub_ps(iy0,jy1);
1383             dz01             = _mm_sub_ps(iz0,jz1);
1384             dx02             = _mm_sub_ps(ix0,jx2);
1385             dy02             = _mm_sub_ps(iy0,jy2);
1386             dz02             = _mm_sub_ps(iz0,jz2);
1387             dx10             = _mm_sub_ps(ix1,jx0);
1388             dy10             = _mm_sub_ps(iy1,jy0);
1389             dz10             = _mm_sub_ps(iz1,jz0);
1390             dx11             = _mm_sub_ps(ix1,jx1);
1391             dy11             = _mm_sub_ps(iy1,jy1);
1392             dz11             = _mm_sub_ps(iz1,jz1);
1393             dx12             = _mm_sub_ps(ix1,jx2);
1394             dy12             = _mm_sub_ps(iy1,jy2);
1395             dz12             = _mm_sub_ps(iz1,jz2);
1396             dx20             = _mm_sub_ps(ix2,jx0);
1397             dy20             = _mm_sub_ps(iy2,jy0);
1398             dz20             = _mm_sub_ps(iz2,jz0);
1399             dx21             = _mm_sub_ps(ix2,jx1);
1400             dy21             = _mm_sub_ps(iy2,jy1);
1401             dz21             = _mm_sub_ps(iz2,jz1);
1402             dx22             = _mm_sub_ps(ix2,jx2);
1403             dy22             = _mm_sub_ps(iy2,jy2);
1404             dz22             = _mm_sub_ps(iz2,jz2);
1405
1406             /* Calculate squared distance and things based on it */
1407             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1408             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1409             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1410             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1411             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1412             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1413             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1414             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1415             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1416
1417             rinv00           = avx128fma_invsqrt_f(rsq00);
1418             rinv01           = avx128fma_invsqrt_f(rsq01);
1419             rinv02           = avx128fma_invsqrt_f(rsq02);
1420             rinv10           = avx128fma_invsqrt_f(rsq10);
1421             rinv11           = avx128fma_invsqrt_f(rsq11);
1422             rinv12           = avx128fma_invsqrt_f(rsq12);
1423             rinv20           = avx128fma_invsqrt_f(rsq20);
1424             rinv21           = avx128fma_invsqrt_f(rsq21);
1425             rinv22           = avx128fma_invsqrt_f(rsq22);
1426
1427             fjx0             = _mm_setzero_ps();
1428             fjy0             = _mm_setzero_ps();
1429             fjz0             = _mm_setzero_ps();
1430             fjx1             = _mm_setzero_ps();
1431             fjy1             = _mm_setzero_ps();
1432             fjz1             = _mm_setzero_ps();
1433             fjx2             = _mm_setzero_ps();
1434             fjy2             = _mm_setzero_ps();
1435             fjz2             = _mm_setzero_ps();
1436
1437             /**************************
1438              * CALCULATE INTERACTIONS *
1439              **************************/
1440
1441             r00              = _mm_mul_ps(rsq00,rinv00);
1442
1443             /* Calculate table index by multiplying r with table scale and truncate to integer */
1444             rt               = _mm_mul_ps(r00,vftabscale);
1445             vfitab           = _mm_cvttps_epi32(rt);
1446 #ifdef __XOP__
1447             vfeps            = _mm_frcz_ps(rt);
1448 #else
1449             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1450 #endif
1451             twovfeps         = _mm_add_ps(vfeps,vfeps);
1452             vfitab           = _mm_slli_epi32(vfitab,2);
1453
1454             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1455             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1456             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1457             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1458             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1459             _MM_TRANSPOSE4_PS(Y,F,G,H);
1460             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1461             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1462             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1463
1464             fscal            = felec;
1465
1466              /* Update vectorial force */
1467             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1468             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1469             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1470
1471             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1472             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1473             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1474
1475             /**************************
1476              * CALCULATE INTERACTIONS *
1477              **************************/
1478
1479             r01              = _mm_mul_ps(rsq01,rinv01);
1480
1481             /* Calculate table index by multiplying r with table scale and truncate to integer */
1482             rt               = _mm_mul_ps(r01,vftabscale);
1483             vfitab           = _mm_cvttps_epi32(rt);
1484 #ifdef __XOP__
1485             vfeps            = _mm_frcz_ps(rt);
1486 #else
1487             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1488 #endif
1489             twovfeps         = _mm_add_ps(vfeps,vfeps);
1490             vfitab           = _mm_slli_epi32(vfitab,2);
1491
1492             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1493             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1494             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1495             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1496             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1497             _MM_TRANSPOSE4_PS(Y,F,G,H);
1498             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1499             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1500             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1501
1502             fscal            = felec;
1503
1504              /* Update vectorial force */
1505             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1506             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1507             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1508
1509             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1510             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1511             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1512
1513             /**************************
1514              * CALCULATE INTERACTIONS *
1515              **************************/
1516
1517             r02              = _mm_mul_ps(rsq02,rinv02);
1518
1519             /* Calculate table index by multiplying r with table scale and truncate to integer */
1520             rt               = _mm_mul_ps(r02,vftabscale);
1521             vfitab           = _mm_cvttps_epi32(rt);
1522 #ifdef __XOP__
1523             vfeps            = _mm_frcz_ps(rt);
1524 #else
1525             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1526 #endif
1527             twovfeps         = _mm_add_ps(vfeps,vfeps);
1528             vfitab           = _mm_slli_epi32(vfitab,2);
1529
1530             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1531             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1532             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1533             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1534             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1535             _MM_TRANSPOSE4_PS(Y,F,G,H);
1536             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1537             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1538             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1539
1540             fscal            = felec;
1541
1542              /* Update vectorial force */
1543             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1544             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1545             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1546
1547             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1548             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1549             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1550
1551             /**************************
1552              * CALCULATE INTERACTIONS *
1553              **************************/
1554
1555             r10              = _mm_mul_ps(rsq10,rinv10);
1556
1557             /* Calculate table index by multiplying r with table scale and truncate to integer */
1558             rt               = _mm_mul_ps(r10,vftabscale);
1559             vfitab           = _mm_cvttps_epi32(rt);
1560 #ifdef __XOP__
1561             vfeps            = _mm_frcz_ps(rt);
1562 #else
1563             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1564 #endif
1565             twovfeps         = _mm_add_ps(vfeps,vfeps);
1566             vfitab           = _mm_slli_epi32(vfitab,2);
1567
1568             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1569             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1570             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1571             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1572             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1573             _MM_TRANSPOSE4_PS(Y,F,G,H);
1574             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1575             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1576             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1577
1578             fscal            = felec;
1579
1580              /* Update vectorial force */
1581             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1582             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1583             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1584
1585             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1586             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1587             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1588
1589             /**************************
1590              * CALCULATE INTERACTIONS *
1591              **************************/
1592
1593             r11              = _mm_mul_ps(rsq11,rinv11);
1594
1595             /* Calculate table index by multiplying r with table scale and truncate to integer */
1596             rt               = _mm_mul_ps(r11,vftabscale);
1597             vfitab           = _mm_cvttps_epi32(rt);
1598 #ifdef __XOP__
1599             vfeps            = _mm_frcz_ps(rt);
1600 #else
1601             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1602 #endif
1603             twovfeps         = _mm_add_ps(vfeps,vfeps);
1604             vfitab           = _mm_slli_epi32(vfitab,2);
1605
1606             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1607             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1608             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1609             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1610             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1611             _MM_TRANSPOSE4_PS(Y,F,G,H);
1612             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1613             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1614             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1615
1616             fscal            = felec;
1617
1618              /* Update vectorial force */
1619             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1620             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1621             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1622
1623             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1624             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1625             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1626
1627             /**************************
1628              * CALCULATE INTERACTIONS *
1629              **************************/
1630
1631             r12              = _mm_mul_ps(rsq12,rinv12);
1632
1633             /* Calculate table index by multiplying r with table scale and truncate to integer */
1634             rt               = _mm_mul_ps(r12,vftabscale);
1635             vfitab           = _mm_cvttps_epi32(rt);
1636 #ifdef __XOP__
1637             vfeps            = _mm_frcz_ps(rt);
1638 #else
1639             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1640 #endif
1641             twovfeps         = _mm_add_ps(vfeps,vfeps);
1642             vfitab           = _mm_slli_epi32(vfitab,2);
1643
1644             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1645             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1646             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1647             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1648             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1649             _MM_TRANSPOSE4_PS(Y,F,G,H);
1650             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1651             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1652             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1653
1654             fscal            = felec;
1655
1656              /* Update vectorial force */
1657             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1658             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1659             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1660
1661             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1662             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1663             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1664
1665             /**************************
1666              * CALCULATE INTERACTIONS *
1667              **************************/
1668
1669             r20              = _mm_mul_ps(rsq20,rinv20);
1670
1671             /* Calculate table index by multiplying r with table scale and truncate to integer */
1672             rt               = _mm_mul_ps(r20,vftabscale);
1673             vfitab           = _mm_cvttps_epi32(rt);
1674 #ifdef __XOP__
1675             vfeps            = _mm_frcz_ps(rt);
1676 #else
1677             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1678 #endif
1679             twovfeps         = _mm_add_ps(vfeps,vfeps);
1680             vfitab           = _mm_slli_epi32(vfitab,2);
1681
1682             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1683             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1684             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1685             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1686             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1687             _MM_TRANSPOSE4_PS(Y,F,G,H);
1688             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1689             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1690             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1691
1692             fscal            = felec;
1693
1694              /* Update vectorial force */
1695             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1696             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1697             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1698
1699             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1700             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1701             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1702
1703             /**************************
1704              * CALCULATE INTERACTIONS *
1705              **************************/
1706
1707             r21              = _mm_mul_ps(rsq21,rinv21);
1708
1709             /* Calculate table index by multiplying r with table scale and truncate to integer */
1710             rt               = _mm_mul_ps(r21,vftabscale);
1711             vfitab           = _mm_cvttps_epi32(rt);
1712 #ifdef __XOP__
1713             vfeps            = _mm_frcz_ps(rt);
1714 #else
1715             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1716 #endif
1717             twovfeps         = _mm_add_ps(vfeps,vfeps);
1718             vfitab           = _mm_slli_epi32(vfitab,2);
1719
1720             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1721             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1722             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1723             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1724             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1725             _MM_TRANSPOSE4_PS(Y,F,G,H);
1726             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1727             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1728             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1729
1730             fscal            = felec;
1731
1732              /* Update vectorial force */
1733             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1734             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1735             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1736
1737             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1738             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1739             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1740
1741             /**************************
1742              * CALCULATE INTERACTIONS *
1743              **************************/
1744
1745             r22              = _mm_mul_ps(rsq22,rinv22);
1746
1747             /* Calculate table index by multiplying r with table scale and truncate to integer */
1748             rt               = _mm_mul_ps(r22,vftabscale);
1749             vfitab           = _mm_cvttps_epi32(rt);
1750 #ifdef __XOP__
1751             vfeps            = _mm_frcz_ps(rt);
1752 #else
1753             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1754 #endif
1755             twovfeps         = _mm_add_ps(vfeps,vfeps);
1756             vfitab           = _mm_slli_epi32(vfitab,2);
1757
1758             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1759             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1760             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1761             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1762             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1763             _MM_TRANSPOSE4_PS(Y,F,G,H);
1764             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1765             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1766             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1767
1768             fscal            = felec;
1769
1770              /* Update vectorial force */
1771             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1772             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1773             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1774
1775             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1776             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1777             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1778
1779             fjptrA             = f+j_coord_offsetA;
1780             fjptrB             = f+j_coord_offsetB;
1781             fjptrC             = f+j_coord_offsetC;
1782             fjptrD             = f+j_coord_offsetD;
1783
1784             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1785                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1786
1787             /* Inner loop uses 378 flops */
1788         }
1789
1790         if(jidx<j_index_end)
1791         {
1792
1793             /* Get j neighbor index, and coordinate index */
1794             jnrlistA         = jjnr[jidx];
1795             jnrlistB         = jjnr[jidx+1];
1796             jnrlistC         = jjnr[jidx+2];
1797             jnrlistD         = jjnr[jidx+3];
1798             /* Sign of each element will be negative for non-real atoms.
1799              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1800              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1801              */
1802             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1803             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1804             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1805             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1806             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1807             j_coord_offsetA  = DIM*jnrA;
1808             j_coord_offsetB  = DIM*jnrB;
1809             j_coord_offsetC  = DIM*jnrC;
1810             j_coord_offsetD  = DIM*jnrD;
1811
1812             /* load j atom coordinates */
1813             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1814                                               x+j_coord_offsetC,x+j_coord_offsetD,
1815                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1816
1817             /* Calculate displacement vector */
1818             dx00             = _mm_sub_ps(ix0,jx0);
1819             dy00             = _mm_sub_ps(iy0,jy0);
1820             dz00             = _mm_sub_ps(iz0,jz0);
1821             dx01             = _mm_sub_ps(ix0,jx1);
1822             dy01             = _mm_sub_ps(iy0,jy1);
1823             dz01             = _mm_sub_ps(iz0,jz1);
1824             dx02             = _mm_sub_ps(ix0,jx2);
1825             dy02             = _mm_sub_ps(iy0,jy2);
1826             dz02             = _mm_sub_ps(iz0,jz2);
1827             dx10             = _mm_sub_ps(ix1,jx0);
1828             dy10             = _mm_sub_ps(iy1,jy0);
1829             dz10             = _mm_sub_ps(iz1,jz0);
1830             dx11             = _mm_sub_ps(ix1,jx1);
1831             dy11             = _mm_sub_ps(iy1,jy1);
1832             dz11             = _mm_sub_ps(iz1,jz1);
1833             dx12             = _mm_sub_ps(ix1,jx2);
1834             dy12             = _mm_sub_ps(iy1,jy2);
1835             dz12             = _mm_sub_ps(iz1,jz2);
1836             dx20             = _mm_sub_ps(ix2,jx0);
1837             dy20             = _mm_sub_ps(iy2,jy0);
1838             dz20             = _mm_sub_ps(iz2,jz0);
1839             dx21             = _mm_sub_ps(ix2,jx1);
1840             dy21             = _mm_sub_ps(iy2,jy1);
1841             dz21             = _mm_sub_ps(iz2,jz1);
1842             dx22             = _mm_sub_ps(ix2,jx2);
1843             dy22             = _mm_sub_ps(iy2,jy2);
1844             dz22             = _mm_sub_ps(iz2,jz2);
1845
1846             /* Calculate squared distance and things based on it */
1847             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1848             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1849             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1850             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1851             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1852             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1853             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1854             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1855             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1856
1857             rinv00           = avx128fma_invsqrt_f(rsq00);
1858             rinv01           = avx128fma_invsqrt_f(rsq01);
1859             rinv02           = avx128fma_invsqrt_f(rsq02);
1860             rinv10           = avx128fma_invsqrt_f(rsq10);
1861             rinv11           = avx128fma_invsqrt_f(rsq11);
1862             rinv12           = avx128fma_invsqrt_f(rsq12);
1863             rinv20           = avx128fma_invsqrt_f(rsq20);
1864             rinv21           = avx128fma_invsqrt_f(rsq21);
1865             rinv22           = avx128fma_invsqrt_f(rsq22);
1866
1867             fjx0             = _mm_setzero_ps();
1868             fjy0             = _mm_setzero_ps();
1869             fjz0             = _mm_setzero_ps();
1870             fjx1             = _mm_setzero_ps();
1871             fjy1             = _mm_setzero_ps();
1872             fjz1             = _mm_setzero_ps();
1873             fjx2             = _mm_setzero_ps();
1874             fjy2             = _mm_setzero_ps();
1875             fjz2             = _mm_setzero_ps();
1876
1877             /**************************
1878              * CALCULATE INTERACTIONS *
1879              **************************/
1880
1881             r00              = _mm_mul_ps(rsq00,rinv00);
1882             r00              = _mm_andnot_ps(dummy_mask,r00);
1883
1884             /* Calculate table index by multiplying r with table scale and truncate to integer */
1885             rt               = _mm_mul_ps(r00,vftabscale);
1886             vfitab           = _mm_cvttps_epi32(rt);
1887 #ifdef __XOP__
1888             vfeps            = _mm_frcz_ps(rt);
1889 #else
1890             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1891 #endif
1892             twovfeps         = _mm_add_ps(vfeps,vfeps);
1893             vfitab           = _mm_slli_epi32(vfitab,2);
1894
1895             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1896             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1897             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1898             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1899             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1900             _MM_TRANSPOSE4_PS(Y,F,G,H);
1901             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1902             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1903             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1904
1905             fscal            = felec;
1906
1907             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1908
1909              /* Update vectorial force */
1910             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1911             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1912             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1913
1914             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1915             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1916             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1917
1918             /**************************
1919              * CALCULATE INTERACTIONS *
1920              **************************/
1921
1922             r01              = _mm_mul_ps(rsq01,rinv01);
1923             r01              = _mm_andnot_ps(dummy_mask,r01);
1924
1925             /* Calculate table index by multiplying r with table scale and truncate to integer */
1926             rt               = _mm_mul_ps(r01,vftabscale);
1927             vfitab           = _mm_cvttps_epi32(rt);
1928 #ifdef __XOP__
1929             vfeps            = _mm_frcz_ps(rt);
1930 #else
1931             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1932 #endif
1933             twovfeps         = _mm_add_ps(vfeps,vfeps);
1934             vfitab           = _mm_slli_epi32(vfitab,2);
1935
1936             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1937             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1938             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1939             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1940             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1941             _MM_TRANSPOSE4_PS(Y,F,G,H);
1942             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1943             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1944             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1945
1946             fscal            = felec;
1947
1948             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1949
1950              /* Update vectorial force */
1951             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1952             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1953             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1954
1955             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1956             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1957             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1958
1959             /**************************
1960              * CALCULATE INTERACTIONS *
1961              **************************/
1962
1963             r02              = _mm_mul_ps(rsq02,rinv02);
1964             r02              = _mm_andnot_ps(dummy_mask,r02);
1965
1966             /* Calculate table index by multiplying r with table scale and truncate to integer */
1967             rt               = _mm_mul_ps(r02,vftabscale);
1968             vfitab           = _mm_cvttps_epi32(rt);
1969 #ifdef __XOP__
1970             vfeps            = _mm_frcz_ps(rt);
1971 #else
1972             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1973 #endif
1974             twovfeps         = _mm_add_ps(vfeps,vfeps);
1975             vfitab           = _mm_slli_epi32(vfitab,2);
1976
1977             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1978             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1979             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1980             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1981             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1982             _MM_TRANSPOSE4_PS(Y,F,G,H);
1983             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1984             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1985             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1986
1987             fscal            = felec;
1988
1989             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1990
1991              /* Update vectorial force */
1992             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1993             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1994             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1995
1996             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1997             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1998             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1999
2000             /**************************
2001              * CALCULATE INTERACTIONS *
2002              **************************/
2003
2004             r10              = _mm_mul_ps(rsq10,rinv10);
2005             r10              = _mm_andnot_ps(dummy_mask,r10);
2006
2007             /* Calculate table index by multiplying r with table scale and truncate to integer */
2008             rt               = _mm_mul_ps(r10,vftabscale);
2009             vfitab           = _mm_cvttps_epi32(rt);
2010 #ifdef __XOP__
2011             vfeps            = _mm_frcz_ps(rt);
2012 #else
2013             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2014 #endif
2015             twovfeps         = _mm_add_ps(vfeps,vfeps);
2016             vfitab           = _mm_slli_epi32(vfitab,2);
2017
2018             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2019             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2020             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2021             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2022             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2023             _MM_TRANSPOSE4_PS(Y,F,G,H);
2024             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2025             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2026             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2027
2028             fscal            = felec;
2029
2030             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2031
2032              /* Update vectorial force */
2033             fix1             = _mm_macc_ps(dx10,fscal,fix1);
2034             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
2035             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
2036
2037             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
2038             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
2039             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
2040
2041             /**************************
2042              * CALCULATE INTERACTIONS *
2043              **************************/
2044
2045             r11              = _mm_mul_ps(rsq11,rinv11);
2046             r11              = _mm_andnot_ps(dummy_mask,r11);
2047
2048             /* Calculate table index by multiplying r with table scale and truncate to integer */
2049             rt               = _mm_mul_ps(r11,vftabscale);
2050             vfitab           = _mm_cvttps_epi32(rt);
2051 #ifdef __XOP__
2052             vfeps            = _mm_frcz_ps(rt);
2053 #else
2054             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2055 #endif
2056             twovfeps         = _mm_add_ps(vfeps,vfeps);
2057             vfitab           = _mm_slli_epi32(vfitab,2);
2058
2059             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2060             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2061             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2062             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2063             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2064             _MM_TRANSPOSE4_PS(Y,F,G,H);
2065             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2066             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2067             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2068
2069             fscal            = felec;
2070
2071             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2072
2073              /* Update vectorial force */
2074             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2075             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2076             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2077
2078             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2079             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2080             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2081
2082             /**************************
2083              * CALCULATE INTERACTIONS *
2084              **************************/
2085
2086             r12              = _mm_mul_ps(rsq12,rinv12);
2087             r12              = _mm_andnot_ps(dummy_mask,r12);
2088
2089             /* Calculate table index by multiplying r with table scale and truncate to integer */
2090             rt               = _mm_mul_ps(r12,vftabscale);
2091             vfitab           = _mm_cvttps_epi32(rt);
2092 #ifdef __XOP__
2093             vfeps            = _mm_frcz_ps(rt);
2094 #else
2095             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2096 #endif
2097             twovfeps         = _mm_add_ps(vfeps,vfeps);
2098             vfitab           = _mm_slli_epi32(vfitab,2);
2099
2100             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2101             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2102             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2103             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2104             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2105             _MM_TRANSPOSE4_PS(Y,F,G,H);
2106             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2107             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2108             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2109
2110             fscal            = felec;
2111
2112             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2113
2114              /* Update vectorial force */
2115             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2116             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2117             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2118
2119             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2120             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2121             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2122
2123             /**************************
2124              * CALCULATE INTERACTIONS *
2125              **************************/
2126
2127             r20              = _mm_mul_ps(rsq20,rinv20);
2128             r20              = _mm_andnot_ps(dummy_mask,r20);
2129
2130             /* Calculate table index by multiplying r with table scale and truncate to integer */
2131             rt               = _mm_mul_ps(r20,vftabscale);
2132             vfitab           = _mm_cvttps_epi32(rt);
2133 #ifdef __XOP__
2134             vfeps            = _mm_frcz_ps(rt);
2135 #else
2136             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2137 #endif
2138             twovfeps         = _mm_add_ps(vfeps,vfeps);
2139             vfitab           = _mm_slli_epi32(vfitab,2);
2140
2141             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2142             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2143             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2144             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2145             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2146             _MM_TRANSPOSE4_PS(Y,F,G,H);
2147             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2148             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2149             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2150
2151             fscal            = felec;
2152
2153             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2154
2155              /* Update vectorial force */
2156             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2157             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2158             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2159
2160             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2161             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2162             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2163
2164             /**************************
2165              * CALCULATE INTERACTIONS *
2166              **************************/
2167
2168             r21              = _mm_mul_ps(rsq21,rinv21);
2169             r21              = _mm_andnot_ps(dummy_mask,r21);
2170
2171             /* Calculate table index by multiplying r with table scale and truncate to integer */
2172             rt               = _mm_mul_ps(r21,vftabscale);
2173             vfitab           = _mm_cvttps_epi32(rt);
2174 #ifdef __XOP__
2175             vfeps            = _mm_frcz_ps(rt);
2176 #else
2177             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2178 #endif
2179             twovfeps         = _mm_add_ps(vfeps,vfeps);
2180             vfitab           = _mm_slli_epi32(vfitab,2);
2181
2182             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2183             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2184             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2185             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2186             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2187             _MM_TRANSPOSE4_PS(Y,F,G,H);
2188             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2189             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2190             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2191
2192             fscal            = felec;
2193
2194             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2195
2196              /* Update vectorial force */
2197             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2198             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2199             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2200
2201             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2202             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2203             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2204
2205             /**************************
2206              * CALCULATE INTERACTIONS *
2207              **************************/
2208
2209             r22              = _mm_mul_ps(rsq22,rinv22);
2210             r22              = _mm_andnot_ps(dummy_mask,r22);
2211
2212             /* Calculate table index by multiplying r with table scale and truncate to integer */
2213             rt               = _mm_mul_ps(r22,vftabscale);
2214             vfitab           = _mm_cvttps_epi32(rt);
2215 #ifdef __XOP__
2216             vfeps            = _mm_frcz_ps(rt);
2217 #else
2218             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2219 #endif
2220             twovfeps         = _mm_add_ps(vfeps,vfeps);
2221             vfitab           = _mm_slli_epi32(vfitab,2);
2222
2223             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2224             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2225             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2226             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2227             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2228             _MM_TRANSPOSE4_PS(Y,F,G,H);
2229             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2230             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2231             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2232
2233             fscal            = felec;
2234
2235             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2236
2237              /* Update vectorial force */
2238             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2239             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2240             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2241
2242             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2243             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2244             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2245
2246             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2247             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2248             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2249             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2250
2251             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2252                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2253
2254             /* Inner loop uses 387 flops */
2255         }
2256
2257         /* End of innermost loop */
2258
2259         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2260                                               f+i_coord_offset,fshift+i_shift_offset);
2261
2262         /* Increment number of inner iterations */
2263         inneriter                  += j_index_end - j_index_start;
2264
2265         /* Outer loop uses 18 flops */
2266     }
2267
2268     /* Increment number of outer iterations */
2269     outeriter        += nri;
2270
2271     /* Update outer/inner flops */
2272
2273     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*387);
2274 }