6358446dc8d3106b2b981537e3d647d1172fab27
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecCSTab_VdwNone_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_single
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            None
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
96     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
97     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
98     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
99     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
100     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
101     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
102     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
103     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
104     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
105     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     __m128i          vfitab;
109     __m128i          ifour       = _mm_set1_epi32(4);
110     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
111     real             *vftab;
112     __m128           dummy_mask,cutoff_mask;
113     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114     __m128           one     = _mm_set1_ps(1.0);
115     __m128           two     = _mm_set1_ps(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm_set1_ps(fr->epsfac);
128     charge           = mdatoms->chargeA;
129
130     vftab            = kernel_data->table_elec->data;
131     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
132
133     /* Setup water-specific parameters */
134     inr              = nlist->iinr[0];
135     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
136     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
137     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
138
139     jq1              = _mm_set1_ps(charge[inr+1]);
140     jq2              = _mm_set1_ps(charge[inr+2]);
141     jq3              = _mm_set1_ps(charge[inr+3]);
142     qq11             = _mm_mul_ps(iq1,jq1);
143     qq12             = _mm_mul_ps(iq1,jq2);
144     qq13             = _mm_mul_ps(iq1,jq3);
145     qq21             = _mm_mul_ps(iq2,jq1);
146     qq22             = _mm_mul_ps(iq2,jq2);
147     qq23             = _mm_mul_ps(iq2,jq3);
148     qq31             = _mm_mul_ps(iq3,jq1);
149     qq32             = _mm_mul_ps(iq3,jq2);
150     qq33             = _mm_mul_ps(iq3,jq3);
151
152     /* Avoid stupid compiler warnings */
153     jnrA = jnrB = jnrC = jnrD = 0;
154     j_coord_offsetA = 0;
155     j_coord_offsetB = 0;
156     j_coord_offsetC = 0;
157     j_coord_offsetD = 0;
158
159     outeriter        = 0;
160     inneriter        = 0;
161
162     for(iidx=0;iidx<4*DIM;iidx++)
163     {
164         scratch[iidx] = 0.0;
165     }
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
183                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
184
185         fix1             = _mm_setzero_ps();
186         fiy1             = _mm_setzero_ps();
187         fiz1             = _mm_setzero_ps();
188         fix2             = _mm_setzero_ps();
189         fiy2             = _mm_setzero_ps();
190         fiz2             = _mm_setzero_ps();
191         fix3             = _mm_setzero_ps();
192         fiy3             = _mm_setzero_ps();
193         fiz3             = _mm_setzero_ps();
194
195         /* Reset potential sums */
196         velecsum         = _mm_setzero_ps();
197
198         /* Start inner kernel loop */
199         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200         {
201
202             /* Get j neighbor index, and coordinate index */
203             jnrA             = jjnr[jidx];
204             jnrB             = jjnr[jidx+1];
205             jnrC             = jjnr[jidx+2];
206             jnrD             = jjnr[jidx+3];
207             j_coord_offsetA  = DIM*jnrA;
208             j_coord_offsetB  = DIM*jnrB;
209             j_coord_offsetC  = DIM*jnrC;
210             j_coord_offsetD  = DIM*jnrD;
211
212             /* load j atom coordinates */
213             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
214                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
215                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
216
217             /* Calculate displacement vector */
218             dx11             = _mm_sub_ps(ix1,jx1);
219             dy11             = _mm_sub_ps(iy1,jy1);
220             dz11             = _mm_sub_ps(iz1,jz1);
221             dx12             = _mm_sub_ps(ix1,jx2);
222             dy12             = _mm_sub_ps(iy1,jy2);
223             dz12             = _mm_sub_ps(iz1,jz2);
224             dx13             = _mm_sub_ps(ix1,jx3);
225             dy13             = _mm_sub_ps(iy1,jy3);
226             dz13             = _mm_sub_ps(iz1,jz3);
227             dx21             = _mm_sub_ps(ix2,jx1);
228             dy21             = _mm_sub_ps(iy2,jy1);
229             dz21             = _mm_sub_ps(iz2,jz1);
230             dx22             = _mm_sub_ps(ix2,jx2);
231             dy22             = _mm_sub_ps(iy2,jy2);
232             dz22             = _mm_sub_ps(iz2,jz2);
233             dx23             = _mm_sub_ps(ix2,jx3);
234             dy23             = _mm_sub_ps(iy2,jy3);
235             dz23             = _mm_sub_ps(iz2,jz3);
236             dx31             = _mm_sub_ps(ix3,jx1);
237             dy31             = _mm_sub_ps(iy3,jy1);
238             dz31             = _mm_sub_ps(iz3,jz1);
239             dx32             = _mm_sub_ps(ix3,jx2);
240             dy32             = _mm_sub_ps(iy3,jy2);
241             dz32             = _mm_sub_ps(iz3,jz2);
242             dx33             = _mm_sub_ps(ix3,jx3);
243             dy33             = _mm_sub_ps(iy3,jy3);
244             dz33             = _mm_sub_ps(iz3,jz3);
245
246             /* Calculate squared distance and things based on it */
247             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
248             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
249             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
250             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
251             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
252             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
253             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
254             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
255             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
256
257             rinv11           = gmx_mm_invsqrt_ps(rsq11);
258             rinv12           = gmx_mm_invsqrt_ps(rsq12);
259             rinv13           = gmx_mm_invsqrt_ps(rsq13);
260             rinv21           = gmx_mm_invsqrt_ps(rsq21);
261             rinv22           = gmx_mm_invsqrt_ps(rsq22);
262             rinv23           = gmx_mm_invsqrt_ps(rsq23);
263             rinv31           = gmx_mm_invsqrt_ps(rsq31);
264             rinv32           = gmx_mm_invsqrt_ps(rsq32);
265             rinv33           = gmx_mm_invsqrt_ps(rsq33);
266
267             fjx1             = _mm_setzero_ps();
268             fjy1             = _mm_setzero_ps();
269             fjz1             = _mm_setzero_ps();
270             fjx2             = _mm_setzero_ps();
271             fjy2             = _mm_setzero_ps();
272             fjz2             = _mm_setzero_ps();
273             fjx3             = _mm_setzero_ps();
274             fjy3             = _mm_setzero_ps();
275             fjz3             = _mm_setzero_ps();
276
277             /**************************
278              * CALCULATE INTERACTIONS *
279              **************************/
280
281             r11              = _mm_mul_ps(rsq11,rinv11);
282
283             /* Calculate table index by multiplying r with table scale and truncate to integer */
284             rt               = _mm_mul_ps(r11,vftabscale);
285             vfitab           = _mm_cvttps_epi32(rt);
286 #ifdef __XOP__
287             vfeps            = _mm_frcz_ps(rt);
288 #else
289             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
290 #endif
291             twovfeps         = _mm_add_ps(vfeps,vfeps);
292             vfitab           = _mm_slli_epi32(vfitab,2);
293
294             /* CUBIC SPLINE TABLE ELECTROSTATICS */
295             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
296             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
297             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
298             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
299             _MM_TRANSPOSE4_PS(Y,F,G,H);
300             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
301             VV               = _mm_macc_ps(vfeps,Fp,Y);
302             velec            = _mm_mul_ps(qq11,VV);
303             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
304             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
305
306             /* Update potential sum for this i atom from the interaction with this j atom. */
307             velecsum         = _mm_add_ps(velecsum,velec);
308
309             fscal            = felec;
310
311              /* Update vectorial force */
312             fix1             = _mm_macc_ps(dx11,fscal,fix1);
313             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
314             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
315
316             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
317             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
318             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
319
320             /**************************
321              * CALCULATE INTERACTIONS *
322              **************************/
323
324             r12              = _mm_mul_ps(rsq12,rinv12);
325
326             /* Calculate table index by multiplying r with table scale and truncate to integer */
327             rt               = _mm_mul_ps(r12,vftabscale);
328             vfitab           = _mm_cvttps_epi32(rt);
329 #ifdef __XOP__
330             vfeps            = _mm_frcz_ps(rt);
331 #else
332             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
333 #endif
334             twovfeps         = _mm_add_ps(vfeps,vfeps);
335             vfitab           = _mm_slli_epi32(vfitab,2);
336
337             /* CUBIC SPLINE TABLE ELECTROSTATICS */
338             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
339             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
340             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
341             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
342             _MM_TRANSPOSE4_PS(Y,F,G,H);
343             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
344             VV               = _mm_macc_ps(vfeps,Fp,Y);
345             velec            = _mm_mul_ps(qq12,VV);
346             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
347             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
348
349             /* Update potential sum for this i atom from the interaction with this j atom. */
350             velecsum         = _mm_add_ps(velecsum,velec);
351
352             fscal            = felec;
353
354              /* Update vectorial force */
355             fix1             = _mm_macc_ps(dx12,fscal,fix1);
356             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
357             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
358
359             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
360             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
361             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
362
363             /**************************
364              * CALCULATE INTERACTIONS *
365              **************************/
366
367             r13              = _mm_mul_ps(rsq13,rinv13);
368
369             /* Calculate table index by multiplying r with table scale and truncate to integer */
370             rt               = _mm_mul_ps(r13,vftabscale);
371             vfitab           = _mm_cvttps_epi32(rt);
372 #ifdef __XOP__
373             vfeps            = _mm_frcz_ps(rt);
374 #else
375             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
376 #endif
377             twovfeps         = _mm_add_ps(vfeps,vfeps);
378             vfitab           = _mm_slli_epi32(vfitab,2);
379
380             /* CUBIC SPLINE TABLE ELECTROSTATICS */
381             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
382             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
383             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
384             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
385             _MM_TRANSPOSE4_PS(Y,F,G,H);
386             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
387             VV               = _mm_macc_ps(vfeps,Fp,Y);
388             velec            = _mm_mul_ps(qq13,VV);
389             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
390             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
391
392             /* Update potential sum for this i atom from the interaction with this j atom. */
393             velecsum         = _mm_add_ps(velecsum,velec);
394
395             fscal            = felec;
396
397              /* Update vectorial force */
398             fix1             = _mm_macc_ps(dx13,fscal,fix1);
399             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
400             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
401
402             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
403             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
404             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
405
406             /**************************
407              * CALCULATE INTERACTIONS *
408              **************************/
409
410             r21              = _mm_mul_ps(rsq21,rinv21);
411
412             /* Calculate table index by multiplying r with table scale and truncate to integer */
413             rt               = _mm_mul_ps(r21,vftabscale);
414             vfitab           = _mm_cvttps_epi32(rt);
415 #ifdef __XOP__
416             vfeps            = _mm_frcz_ps(rt);
417 #else
418             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
419 #endif
420             twovfeps         = _mm_add_ps(vfeps,vfeps);
421             vfitab           = _mm_slli_epi32(vfitab,2);
422
423             /* CUBIC SPLINE TABLE ELECTROSTATICS */
424             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
425             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
426             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
427             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
428             _MM_TRANSPOSE4_PS(Y,F,G,H);
429             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
430             VV               = _mm_macc_ps(vfeps,Fp,Y);
431             velec            = _mm_mul_ps(qq21,VV);
432             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
433             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
434
435             /* Update potential sum for this i atom from the interaction with this j atom. */
436             velecsum         = _mm_add_ps(velecsum,velec);
437
438             fscal            = felec;
439
440              /* Update vectorial force */
441             fix2             = _mm_macc_ps(dx21,fscal,fix2);
442             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
443             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
444
445             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
446             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
447             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
448
449             /**************************
450              * CALCULATE INTERACTIONS *
451              **************************/
452
453             r22              = _mm_mul_ps(rsq22,rinv22);
454
455             /* Calculate table index by multiplying r with table scale and truncate to integer */
456             rt               = _mm_mul_ps(r22,vftabscale);
457             vfitab           = _mm_cvttps_epi32(rt);
458 #ifdef __XOP__
459             vfeps            = _mm_frcz_ps(rt);
460 #else
461             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
462 #endif
463             twovfeps         = _mm_add_ps(vfeps,vfeps);
464             vfitab           = _mm_slli_epi32(vfitab,2);
465
466             /* CUBIC SPLINE TABLE ELECTROSTATICS */
467             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
468             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
469             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
470             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
471             _MM_TRANSPOSE4_PS(Y,F,G,H);
472             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
473             VV               = _mm_macc_ps(vfeps,Fp,Y);
474             velec            = _mm_mul_ps(qq22,VV);
475             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
476             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
477
478             /* Update potential sum for this i atom from the interaction with this j atom. */
479             velecsum         = _mm_add_ps(velecsum,velec);
480
481             fscal            = felec;
482
483              /* Update vectorial force */
484             fix2             = _mm_macc_ps(dx22,fscal,fix2);
485             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
486             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
487
488             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
489             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
490             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
491
492             /**************************
493              * CALCULATE INTERACTIONS *
494              **************************/
495
496             r23              = _mm_mul_ps(rsq23,rinv23);
497
498             /* Calculate table index by multiplying r with table scale and truncate to integer */
499             rt               = _mm_mul_ps(r23,vftabscale);
500             vfitab           = _mm_cvttps_epi32(rt);
501 #ifdef __XOP__
502             vfeps            = _mm_frcz_ps(rt);
503 #else
504             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
505 #endif
506             twovfeps         = _mm_add_ps(vfeps,vfeps);
507             vfitab           = _mm_slli_epi32(vfitab,2);
508
509             /* CUBIC SPLINE TABLE ELECTROSTATICS */
510             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
511             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
512             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
513             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
514             _MM_TRANSPOSE4_PS(Y,F,G,H);
515             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
516             VV               = _mm_macc_ps(vfeps,Fp,Y);
517             velec            = _mm_mul_ps(qq23,VV);
518             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
519             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
520
521             /* Update potential sum for this i atom from the interaction with this j atom. */
522             velecsum         = _mm_add_ps(velecsum,velec);
523
524             fscal            = felec;
525
526              /* Update vectorial force */
527             fix2             = _mm_macc_ps(dx23,fscal,fix2);
528             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
529             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
530
531             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
532             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
533             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
534
535             /**************************
536              * CALCULATE INTERACTIONS *
537              **************************/
538
539             r31              = _mm_mul_ps(rsq31,rinv31);
540
541             /* Calculate table index by multiplying r with table scale and truncate to integer */
542             rt               = _mm_mul_ps(r31,vftabscale);
543             vfitab           = _mm_cvttps_epi32(rt);
544 #ifdef __XOP__
545             vfeps            = _mm_frcz_ps(rt);
546 #else
547             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
548 #endif
549             twovfeps         = _mm_add_ps(vfeps,vfeps);
550             vfitab           = _mm_slli_epi32(vfitab,2);
551
552             /* CUBIC SPLINE TABLE ELECTROSTATICS */
553             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
554             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
555             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
556             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
557             _MM_TRANSPOSE4_PS(Y,F,G,H);
558             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
559             VV               = _mm_macc_ps(vfeps,Fp,Y);
560             velec            = _mm_mul_ps(qq31,VV);
561             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
562             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
563
564             /* Update potential sum for this i atom from the interaction with this j atom. */
565             velecsum         = _mm_add_ps(velecsum,velec);
566
567             fscal            = felec;
568
569              /* Update vectorial force */
570             fix3             = _mm_macc_ps(dx31,fscal,fix3);
571             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
572             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
573
574             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
575             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
576             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
577
578             /**************************
579              * CALCULATE INTERACTIONS *
580              **************************/
581
582             r32              = _mm_mul_ps(rsq32,rinv32);
583
584             /* Calculate table index by multiplying r with table scale and truncate to integer */
585             rt               = _mm_mul_ps(r32,vftabscale);
586             vfitab           = _mm_cvttps_epi32(rt);
587 #ifdef __XOP__
588             vfeps            = _mm_frcz_ps(rt);
589 #else
590             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
591 #endif
592             twovfeps         = _mm_add_ps(vfeps,vfeps);
593             vfitab           = _mm_slli_epi32(vfitab,2);
594
595             /* CUBIC SPLINE TABLE ELECTROSTATICS */
596             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
597             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
598             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
599             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
600             _MM_TRANSPOSE4_PS(Y,F,G,H);
601             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
602             VV               = _mm_macc_ps(vfeps,Fp,Y);
603             velec            = _mm_mul_ps(qq32,VV);
604             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
605             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
606
607             /* Update potential sum for this i atom from the interaction with this j atom. */
608             velecsum         = _mm_add_ps(velecsum,velec);
609
610             fscal            = felec;
611
612              /* Update vectorial force */
613             fix3             = _mm_macc_ps(dx32,fscal,fix3);
614             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
615             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
616
617             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
618             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
619             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
620
621             /**************************
622              * CALCULATE INTERACTIONS *
623              **************************/
624
625             r33              = _mm_mul_ps(rsq33,rinv33);
626
627             /* Calculate table index by multiplying r with table scale and truncate to integer */
628             rt               = _mm_mul_ps(r33,vftabscale);
629             vfitab           = _mm_cvttps_epi32(rt);
630 #ifdef __XOP__
631             vfeps            = _mm_frcz_ps(rt);
632 #else
633             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
634 #endif
635             twovfeps         = _mm_add_ps(vfeps,vfeps);
636             vfitab           = _mm_slli_epi32(vfitab,2);
637
638             /* CUBIC SPLINE TABLE ELECTROSTATICS */
639             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
640             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
641             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
642             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
643             _MM_TRANSPOSE4_PS(Y,F,G,H);
644             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
645             VV               = _mm_macc_ps(vfeps,Fp,Y);
646             velec            = _mm_mul_ps(qq33,VV);
647             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
648             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
649
650             /* Update potential sum for this i atom from the interaction with this j atom. */
651             velecsum         = _mm_add_ps(velecsum,velec);
652
653             fscal            = felec;
654
655              /* Update vectorial force */
656             fix3             = _mm_macc_ps(dx33,fscal,fix3);
657             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
658             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
659
660             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
661             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
662             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
663
664             fjptrA             = f+j_coord_offsetA;
665             fjptrB             = f+j_coord_offsetB;
666             fjptrC             = f+j_coord_offsetC;
667             fjptrD             = f+j_coord_offsetD;
668
669             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
670                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
671
672             /* Inner loop uses 414 flops */
673         }
674
675         if(jidx<j_index_end)
676         {
677
678             /* Get j neighbor index, and coordinate index */
679             jnrlistA         = jjnr[jidx];
680             jnrlistB         = jjnr[jidx+1];
681             jnrlistC         = jjnr[jidx+2];
682             jnrlistD         = jjnr[jidx+3];
683             /* Sign of each element will be negative for non-real atoms.
684              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
685              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
686              */
687             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
688             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
689             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
690             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
691             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
692             j_coord_offsetA  = DIM*jnrA;
693             j_coord_offsetB  = DIM*jnrB;
694             j_coord_offsetC  = DIM*jnrC;
695             j_coord_offsetD  = DIM*jnrD;
696
697             /* load j atom coordinates */
698             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
699                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
700                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
701
702             /* Calculate displacement vector */
703             dx11             = _mm_sub_ps(ix1,jx1);
704             dy11             = _mm_sub_ps(iy1,jy1);
705             dz11             = _mm_sub_ps(iz1,jz1);
706             dx12             = _mm_sub_ps(ix1,jx2);
707             dy12             = _mm_sub_ps(iy1,jy2);
708             dz12             = _mm_sub_ps(iz1,jz2);
709             dx13             = _mm_sub_ps(ix1,jx3);
710             dy13             = _mm_sub_ps(iy1,jy3);
711             dz13             = _mm_sub_ps(iz1,jz3);
712             dx21             = _mm_sub_ps(ix2,jx1);
713             dy21             = _mm_sub_ps(iy2,jy1);
714             dz21             = _mm_sub_ps(iz2,jz1);
715             dx22             = _mm_sub_ps(ix2,jx2);
716             dy22             = _mm_sub_ps(iy2,jy2);
717             dz22             = _mm_sub_ps(iz2,jz2);
718             dx23             = _mm_sub_ps(ix2,jx3);
719             dy23             = _mm_sub_ps(iy2,jy3);
720             dz23             = _mm_sub_ps(iz2,jz3);
721             dx31             = _mm_sub_ps(ix3,jx1);
722             dy31             = _mm_sub_ps(iy3,jy1);
723             dz31             = _mm_sub_ps(iz3,jz1);
724             dx32             = _mm_sub_ps(ix3,jx2);
725             dy32             = _mm_sub_ps(iy3,jy2);
726             dz32             = _mm_sub_ps(iz3,jz2);
727             dx33             = _mm_sub_ps(ix3,jx3);
728             dy33             = _mm_sub_ps(iy3,jy3);
729             dz33             = _mm_sub_ps(iz3,jz3);
730
731             /* Calculate squared distance and things based on it */
732             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
733             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
734             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
735             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
736             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
737             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
738             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
739             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
740             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
741
742             rinv11           = gmx_mm_invsqrt_ps(rsq11);
743             rinv12           = gmx_mm_invsqrt_ps(rsq12);
744             rinv13           = gmx_mm_invsqrt_ps(rsq13);
745             rinv21           = gmx_mm_invsqrt_ps(rsq21);
746             rinv22           = gmx_mm_invsqrt_ps(rsq22);
747             rinv23           = gmx_mm_invsqrt_ps(rsq23);
748             rinv31           = gmx_mm_invsqrt_ps(rsq31);
749             rinv32           = gmx_mm_invsqrt_ps(rsq32);
750             rinv33           = gmx_mm_invsqrt_ps(rsq33);
751
752             fjx1             = _mm_setzero_ps();
753             fjy1             = _mm_setzero_ps();
754             fjz1             = _mm_setzero_ps();
755             fjx2             = _mm_setzero_ps();
756             fjy2             = _mm_setzero_ps();
757             fjz2             = _mm_setzero_ps();
758             fjx3             = _mm_setzero_ps();
759             fjy3             = _mm_setzero_ps();
760             fjz3             = _mm_setzero_ps();
761
762             /**************************
763              * CALCULATE INTERACTIONS *
764              **************************/
765
766             r11              = _mm_mul_ps(rsq11,rinv11);
767             r11              = _mm_andnot_ps(dummy_mask,r11);
768
769             /* Calculate table index by multiplying r with table scale and truncate to integer */
770             rt               = _mm_mul_ps(r11,vftabscale);
771             vfitab           = _mm_cvttps_epi32(rt);
772 #ifdef __XOP__
773             vfeps            = _mm_frcz_ps(rt);
774 #else
775             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
776 #endif
777             twovfeps         = _mm_add_ps(vfeps,vfeps);
778             vfitab           = _mm_slli_epi32(vfitab,2);
779
780             /* CUBIC SPLINE TABLE ELECTROSTATICS */
781             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
782             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
783             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
784             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
785             _MM_TRANSPOSE4_PS(Y,F,G,H);
786             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
787             VV               = _mm_macc_ps(vfeps,Fp,Y);
788             velec            = _mm_mul_ps(qq11,VV);
789             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
790             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
791
792             /* Update potential sum for this i atom from the interaction with this j atom. */
793             velec            = _mm_andnot_ps(dummy_mask,velec);
794             velecsum         = _mm_add_ps(velecsum,velec);
795
796             fscal            = felec;
797
798             fscal            = _mm_andnot_ps(dummy_mask,fscal);
799
800              /* Update vectorial force */
801             fix1             = _mm_macc_ps(dx11,fscal,fix1);
802             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
803             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
804
805             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
806             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
807             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
808
809             /**************************
810              * CALCULATE INTERACTIONS *
811              **************************/
812
813             r12              = _mm_mul_ps(rsq12,rinv12);
814             r12              = _mm_andnot_ps(dummy_mask,r12);
815
816             /* Calculate table index by multiplying r with table scale and truncate to integer */
817             rt               = _mm_mul_ps(r12,vftabscale);
818             vfitab           = _mm_cvttps_epi32(rt);
819 #ifdef __XOP__
820             vfeps            = _mm_frcz_ps(rt);
821 #else
822             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
823 #endif
824             twovfeps         = _mm_add_ps(vfeps,vfeps);
825             vfitab           = _mm_slli_epi32(vfitab,2);
826
827             /* CUBIC SPLINE TABLE ELECTROSTATICS */
828             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
829             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
830             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
831             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
832             _MM_TRANSPOSE4_PS(Y,F,G,H);
833             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
834             VV               = _mm_macc_ps(vfeps,Fp,Y);
835             velec            = _mm_mul_ps(qq12,VV);
836             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
837             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
838
839             /* Update potential sum for this i atom from the interaction with this j atom. */
840             velec            = _mm_andnot_ps(dummy_mask,velec);
841             velecsum         = _mm_add_ps(velecsum,velec);
842
843             fscal            = felec;
844
845             fscal            = _mm_andnot_ps(dummy_mask,fscal);
846
847              /* Update vectorial force */
848             fix1             = _mm_macc_ps(dx12,fscal,fix1);
849             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
850             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
851
852             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
853             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
854             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
855
856             /**************************
857              * CALCULATE INTERACTIONS *
858              **************************/
859
860             r13              = _mm_mul_ps(rsq13,rinv13);
861             r13              = _mm_andnot_ps(dummy_mask,r13);
862
863             /* Calculate table index by multiplying r with table scale and truncate to integer */
864             rt               = _mm_mul_ps(r13,vftabscale);
865             vfitab           = _mm_cvttps_epi32(rt);
866 #ifdef __XOP__
867             vfeps            = _mm_frcz_ps(rt);
868 #else
869             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
870 #endif
871             twovfeps         = _mm_add_ps(vfeps,vfeps);
872             vfitab           = _mm_slli_epi32(vfitab,2);
873
874             /* CUBIC SPLINE TABLE ELECTROSTATICS */
875             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
876             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
877             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
878             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
879             _MM_TRANSPOSE4_PS(Y,F,G,H);
880             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
881             VV               = _mm_macc_ps(vfeps,Fp,Y);
882             velec            = _mm_mul_ps(qq13,VV);
883             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
884             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
885
886             /* Update potential sum for this i atom from the interaction with this j atom. */
887             velec            = _mm_andnot_ps(dummy_mask,velec);
888             velecsum         = _mm_add_ps(velecsum,velec);
889
890             fscal            = felec;
891
892             fscal            = _mm_andnot_ps(dummy_mask,fscal);
893
894              /* Update vectorial force */
895             fix1             = _mm_macc_ps(dx13,fscal,fix1);
896             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
897             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
898
899             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
900             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
901             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
902
903             /**************************
904              * CALCULATE INTERACTIONS *
905              **************************/
906
907             r21              = _mm_mul_ps(rsq21,rinv21);
908             r21              = _mm_andnot_ps(dummy_mask,r21);
909
910             /* Calculate table index by multiplying r with table scale and truncate to integer */
911             rt               = _mm_mul_ps(r21,vftabscale);
912             vfitab           = _mm_cvttps_epi32(rt);
913 #ifdef __XOP__
914             vfeps            = _mm_frcz_ps(rt);
915 #else
916             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
917 #endif
918             twovfeps         = _mm_add_ps(vfeps,vfeps);
919             vfitab           = _mm_slli_epi32(vfitab,2);
920
921             /* CUBIC SPLINE TABLE ELECTROSTATICS */
922             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
923             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
924             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
925             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
926             _MM_TRANSPOSE4_PS(Y,F,G,H);
927             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
928             VV               = _mm_macc_ps(vfeps,Fp,Y);
929             velec            = _mm_mul_ps(qq21,VV);
930             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
931             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
932
933             /* Update potential sum for this i atom from the interaction with this j atom. */
934             velec            = _mm_andnot_ps(dummy_mask,velec);
935             velecsum         = _mm_add_ps(velecsum,velec);
936
937             fscal            = felec;
938
939             fscal            = _mm_andnot_ps(dummy_mask,fscal);
940
941              /* Update vectorial force */
942             fix2             = _mm_macc_ps(dx21,fscal,fix2);
943             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
944             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
945
946             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
947             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
948             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
949
950             /**************************
951              * CALCULATE INTERACTIONS *
952              **************************/
953
954             r22              = _mm_mul_ps(rsq22,rinv22);
955             r22              = _mm_andnot_ps(dummy_mask,r22);
956
957             /* Calculate table index by multiplying r with table scale and truncate to integer */
958             rt               = _mm_mul_ps(r22,vftabscale);
959             vfitab           = _mm_cvttps_epi32(rt);
960 #ifdef __XOP__
961             vfeps            = _mm_frcz_ps(rt);
962 #else
963             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
964 #endif
965             twovfeps         = _mm_add_ps(vfeps,vfeps);
966             vfitab           = _mm_slli_epi32(vfitab,2);
967
968             /* CUBIC SPLINE TABLE ELECTROSTATICS */
969             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
970             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
971             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
972             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
973             _MM_TRANSPOSE4_PS(Y,F,G,H);
974             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
975             VV               = _mm_macc_ps(vfeps,Fp,Y);
976             velec            = _mm_mul_ps(qq22,VV);
977             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
978             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
979
980             /* Update potential sum for this i atom from the interaction with this j atom. */
981             velec            = _mm_andnot_ps(dummy_mask,velec);
982             velecsum         = _mm_add_ps(velecsum,velec);
983
984             fscal            = felec;
985
986             fscal            = _mm_andnot_ps(dummy_mask,fscal);
987
988              /* Update vectorial force */
989             fix2             = _mm_macc_ps(dx22,fscal,fix2);
990             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
991             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
992
993             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
994             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
995             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
996
997             /**************************
998              * CALCULATE INTERACTIONS *
999              **************************/
1000
1001             r23              = _mm_mul_ps(rsq23,rinv23);
1002             r23              = _mm_andnot_ps(dummy_mask,r23);
1003
1004             /* Calculate table index by multiplying r with table scale and truncate to integer */
1005             rt               = _mm_mul_ps(r23,vftabscale);
1006             vfitab           = _mm_cvttps_epi32(rt);
1007 #ifdef __XOP__
1008             vfeps            = _mm_frcz_ps(rt);
1009 #else
1010             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1011 #endif
1012             twovfeps         = _mm_add_ps(vfeps,vfeps);
1013             vfitab           = _mm_slli_epi32(vfitab,2);
1014
1015             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1016             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1017             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1018             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1019             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1020             _MM_TRANSPOSE4_PS(Y,F,G,H);
1021             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1022             VV               = _mm_macc_ps(vfeps,Fp,Y);
1023             velec            = _mm_mul_ps(qq23,VV);
1024             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1025             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1026
1027             /* Update potential sum for this i atom from the interaction with this j atom. */
1028             velec            = _mm_andnot_ps(dummy_mask,velec);
1029             velecsum         = _mm_add_ps(velecsum,velec);
1030
1031             fscal            = felec;
1032
1033             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1034
1035              /* Update vectorial force */
1036             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1037             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1038             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1039
1040             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1041             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1042             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1043
1044             /**************************
1045              * CALCULATE INTERACTIONS *
1046              **************************/
1047
1048             r31              = _mm_mul_ps(rsq31,rinv31);
1049             r31              = _mm_andnot_ps(dummy_mask,r31);
1050
1051             /* Calculate table index by multiplying r with table scale and truncate to integer */
1052             rt               = _mm_mul_ps(r31,vftabscale);
1053             vfitab           = _mm_cvttps_epi32(rt);
1054 #ifdef __XOP__
1055             vfeps            = _mm_frcz_ps(rt);
1056 #else
1057             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1058 #endif
1059             twovfeps         = _mm_add_ps(vfeps,vfeps);
1060             vfitab           = _mm_slli_epi32(vfitab,2);
1061
1062             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1063             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1064             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1065             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1066             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1067             _MM_TRANSPOSE4_PS(Y,F,G,H);
1068             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1069             VV               = _mm_macc_ps(vfeps,Fp,Y);
1070             velec            = _mm_mul_ps(qq31,VV);
1071             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1072             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1073
1074             /* Update potential sum for this i atom from the interaction with this j atom. */
1075             velec            = _mm_andnot_ps(dummy_mask,velec);
1076             velecsum         = _mm_add_ps(velecsum,velec);
1077
1078             fscal            = felec;
1079
1080             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1081
1082              /* Update vectorial force */
1083             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1084             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1085             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1086
1087             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1088             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1089             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1090
1091             /**************************
1092              * CALCULATE INTERACTIONS *
1093              **************************/
1094
1095             r32              = _mm_mul_ps(rsq32,rinv32);
1096             r32              = _mm_andnot_ps(dummy_mask,r32);
1097
1098             /* Calculate table index by multiplying r with table scale and truncate to integer */
1099             rt               = _mm_mul_ps(r32,vftabscale);
1100             vfitab           = _mm_cvttps_epi32(rt);
1101 #ifdef __XOP__
1102             vfeps            = _mm_frcz_ps(rt);
1103 #else
1104             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1105 #endif
1106             twovfeps         = _mm_add_ps(vfeps,vfeps);
1107             vfitab           = _mm_slli_epi32(vfitab,2);
1108
1109             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1110             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1111             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1112             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1113             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1114             _MM_TRANSPOSE4_PS(Y,F,G,H);
1115             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1116             VV               = _mm_macc_ps(vfeps,Fp,Y);
1117             velec            = _mm_mul_ps(qq32,VV);
1118             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1119             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1120
1121             /* Update potential sum for this i atom from the interaction with this j atom. */
1122             velec            = _mm_andnot_ps(dummy_mask,velec);
1123             velecsum         = _mm_add_ps(velecsum,velec);
1124
1125             fscal            = felec;
1126
1127             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1128
1129              /* Update vectorial force */
1130             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1131             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1132             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1133
1134             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1135             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1136             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1137
1138             /**************************
1139              * CALCULATE INTERACTIONS *
1140              **************************/
1141
1142             r33              = _mm_mul_ps(rsq33,rinv33);
1143             r33              = _mm_andnot_ps(dummy_mask,r33);
1144
1145             /* Calculate table index by multiplying r with table scale and truncate to integer */
1146             rt               = _mm_mul_ps(r33,vftabscale);
1147             vfitab           = _mm_cvttps_epi32(rt);
1148 #ifdef __XOP__
1149             vfeps            = _mm_frcz_ps(rt);
1150 #else
1151             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1152 #endif
1153             twovfeps         = _mm_add_ps(vfeps,vfeps);
1154             vfitab           = _mm_slli_epi32(vfitab,2);
1155
1156             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1157             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1158             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1159             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1160             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1161             _MM_TRANSPOSE4_PS(Y,F,G,H);
1162             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1163             VV               = _mm_macc_ps(vfeps,Fp,Y);
1164             velec            = _mm_mul_ps(qq33,VV);
1165             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1166             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1167
1168             /* Update potential sum for this i atom from the interaction with this j atom. */
1169             velec            = _mm_andnot_ps(dummy_mask,velec);
1170             velecsum         = _mm_add_ps(velecsum,velec);
1171
1172             fscal            = felec;
1173
1174             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1175
1176              /* Update vectorial force */
1177             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1178             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1179             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1180
1181             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1182             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1183             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1184
1185             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1186             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1187             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1188             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1189
1190             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1191                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1192
1193             /* Inner loop uses 423 flops */
1194         }
1195
1196         /* End of innermost loop */
1197
1198         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1199                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
1200
1201         ggid                        = gid[iidx];
1202         /* Update potential energies */
1203         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1204
1205         /* Increment number of inner iterations */
1206         inneriter                  += j_index_end - j_index_start;
1207
1208         /* Outer loop uses 19 flops */
1209     }
1210
1211     /* Increment number of outer iterations */
1212     outeriter        += nri;
1213
1214     /* Update outer/inner flops */
1215
1216     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*423);
1217 }
1218 /*
1219  * GROMACS nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_single
1220  * Electrostatics interaction: CubicSplineTable
1221  * VdW interaction:            None
1222  * Geometry:                   Water4-Water4
1223  * Calculate force/pot:        Force
1224  */
1225 void
1226 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_128_fma_single
1227                     (t_nblist                    * gmx_restrict       nlist,
1228                      rvec                        * gmx_restrict          xx,
1229                      rvec                        * gmx_restrict          ff,
1230                      t_forcerec                  * gmx_restrict          fr,
1231                      t_mdatoms                   * gmx_restrict     mdatoms,
1232                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1233                      t_nrnb                      * gmx_restrict        nrnb)
1234 {
1235     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1236      * just 0 for non-waters.
1237      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1238      * jnr indices corresponding to data put in the four positions in the SIMD register.
1239      */
1240     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1241     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1242     int              jnrA,jnrB,jnrC,jnrD;
1243     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1244     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1245     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1246     real             rcutoff_scalar;
1247     real             *shiftvec,*fshift,*x,*f;
1248     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1249     real             scratch[4*DIM];
1250     __m128           fscal,rcutoff,rcutoff2,jidxall;
1251     int              vdwioffset1;
1252     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1253     int              vdwioffset2;
1254     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1255     int              vdwioffset3;
1256     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1257     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1258     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1259     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1260     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1261     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1262     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1263     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1264     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1265     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1266     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1267     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1268     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1269     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1270     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1271     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1272     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1273     real             *charge;
1274     __m128i          vfitab;
1275     __m128i          ifour       = _mm_set1_epi32(4);
1276     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1277     real             *vftab;
1278     __m128           dummy_mask,cutoff_mask;
1279     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1280     __m128           one     = _mm_set1_ps(1.0);
1281     __m128           two     = _mm_set1_ps(2.0);
1282     x                = xx[0];
1283     f                = ff[0];
1284
1285     nri              = nlist->nri;
1286     iinr             = nlist->iinr;
1287     jindex           = nlist->jindex;
1288     jjnr             = nlist->jjnr;
1289     shiftidx         = nlist->shift;
1290     gid              = nlist->gid;
1291     shiftvec         = fr->shift_vec[0];
1292     fshift           = fr->fshift[0];
1293     facel            = _mm_set1_ps(fr->epsfac);
1294     charge           = mdatoms->chargeA;
1295
1296     vftab            = kernel_data->table_elec->data;
1297     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1298
1299     /* Setup water-specific parameters */
1300     inr              = nlist->iinr[0];
1301     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1302     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1303     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1304
1305     jq1              = _mm_set1_ps(charge[inr+1]);
1306     jq2              = _mm_set1_ps(charge[inr+2]);
1307     jq3              = _mm_set1_ps(charge[inr+3]);
1308     qq11             = _mm_mul_ps(iq1,jq1);
1309     qq12             = _mm_mul_ps(iq1,jq2);
1310     qq13             = _mm_mul_ps(iq1,jq3);
1311     qq21             = _mm_mul_ps(iq2,jq1);
1312     qq22             = _mm_mul_ps(iq2,jq2);
1313     qq23             = _mm_mul_ps(iq2,jq3);
1314     qq31             = _mm_mul_ps(iq3,jq1);
1315     qq32             = _mm_mul_ps(iq3,jq2);
1316     qq33             = _mm_mul_ps(iq3,jq3);
1317
1318     /* Avoid stupid compiler warnings */
1319     jnrA = jnrB = jnrC = jnrD = 0;
1320     j_coord_offsetA = 0;
1321     j_coord_offsetB = 0;
1322     j_coord_offsetC = 0;
1323     j_coord_offsetD = 0;
1324
1325     outeriter        = 0;
1326     inneriter        = 0;
1327
1328     for(iidx=0;iidx<4*DIM;iidx++)
1329     {
1330         scratch[iidx] = 0.0;
1331     }
1332
1333     /* Start outer loop over neighborlists */
1334     for(iidx=0; iidx<nri; iidx++)
1335     {
1336         /* Load shift vector for this list */
1337         i_shift_offset   = DIM*shiftidx[iidx];
1338
1339         /* Load limits for loop over neighbors */
1340         j_index_start    = jindex[iidx];
1341         j_index_end      = jindex[iidx+1];
1342
1343         /* Get outer coordinate index */
1344         inr              = iinr[iidx];
1345         i_coord_offset   = DIM*inr;
1346
1347         /* Load i particle coords and add shift vector */
1348         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1349                                                  &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1350
1351         fix1             = _mm_setzero_ps();
1352         fiy1             = _mm_setzero_ps();
1353         fiz1             = _mm_setzero_ps();
1354         fix2             = _mm_setzero_ps();
1355         fiy2             = _mm_setzero_ps();
1356         fiz2             = _mm_setzero_ps();
1357         fix3             = _mm_setzero_ps();
1358         fiy3             = _mm_setzero_ps();
1359         fiz3             = _mm_setzero_ps();
1360
1361         /* Start inner kernel loop */
1362         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1363         {
1364
1365             /* Get j neighbor index, and coordinate index */
1366             jnrA             = jjnr[jidx];
1367             jnrB             = jjnr[jidx+1];
1368             jnrC             = jjnr[jidx+2];
1369             jnrD             = jjnr[jidx+3];
1370             j_coord_offsetA  = DIM*jnrA;
1371             j_coord_offsetB  = DIM*jnrB;
1372             j_coord_offsetC  = DIM*jnrC;
1373             j_coord_offsetD  = DIM*jnrD;
1374
1375             /* load j atom coordinates */
1376             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1377                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1378                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1379
1380             /* Calculate displacement vector */
1381             dx11             = _mm_sub_ps(ix1,jx1);
1382             dy11             = _mm_sub_ps(iy1,jy1);
1383             dz11             = _mm_sub_ps(iz1,jz1);
1384             dx12             = _mm_sub_ps(ix1,jx2);
1385             dy12             = _mm_sub_ps(iy1,jy2);
1386             dz12             = _mm_sub_ps(iz1,jz2);
1387             dx13             = _mm_sub_ps(ix1,jx3);
1388             dy13             = _mm_sub_ps(iy1,jy3);
1389             dz13             = _mm_sub_ps(iz1,jz3);
1390             dx21             = _mm_sub_ps(ix2,jx1);
1391             dy21             = _mm_sub_ps(iy2,jy1);
1392             dz21             = _mm_sub_ps(iz2,jz1);
1393             dx22             = _mm_sub_ps(ix2,jx2);
1394             dy22             = _mm_sub_ps(iy2,jy2);
1395             dz22             = _mm_sub_ps(iz2,jz2);
1396             dx23             = _mm_sub_ps(ix2,jx3);
1397             dy23             = _mm_sub_ps(iy2,jy3);
1398             dz23             = _mm_sub_ps(iz2,jz3);
1399             dx31             = _mm_sub_ps(ix3,jx1);
1400             dy31             = _mm_sub_ps(iy3,jy1);
1401             dz31             = _mm_sub_ps(iz3,jz1);
1402             dx32             = _mm_sub_ps(ix3,jx2);
1403             dy32             = _mm_sub_ps(iy3,jy2);
1404             dz32             = _mm_sub_ps(iz3,jz2);
1405             dx33             = _mm_sub_ps(ix3,jx3);
1406             dy33             = _mm_sub_ps(iy3,jy3);
1407             dz33             = _mm_sub_ps(iz3,jz3);
1408
1409             /* Calculate squared distance and things based on it */
1410             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1411             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1412             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1413             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1414             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1415             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1416             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1417             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1418             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1419
1420             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1421             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1422             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1423             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1424             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1425             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1426             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1427             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1428             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1429
1430             fjx1             = _mm_setzero_ps();
1431             fjy1             = _mm_setzero_ps();
1432             fjz1             = _mm_setzero_ps();
1433             fjx2             = _mm_setzero_ps();
1434             fjy2             = _mm_setzero_ps();
1435             fjz2             = _mm_setzero_ps();
1436             fjx3             = _mm_setzero_ps();
1437             fjy3             = _mm_setzero_ps();
1438             fjz3             = _mm_setzero_ps();
1439
1440             /**************************
1441              * CALCULATE INTERACTIONS *
1442              **************************/
1443
1444             r11              = _mm_mul_ps(rsq11,rinv11);
1445
1446             /* Calculate table index by multiplying r with table scale and truncate to integer */
1447             rt               = _mm_mul_ps(r11,vftabscale);
1448             vfitab           = _mm_cvttps_epi32(rt);
1449 #ifdef __XOP__
1450             vfeps            = _mm_frcz_ps(rt);
1451 #else
1452             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1453 #endif
1454             twovfeps         = _mm_add_ps(vfeps,vfeps);
1455             vfitab           = _mm_slli_epi32(vfitab,2);
1456
1457             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1458             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1459             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1460             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1461             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1462             _MM_TRANSPOSE4_PS(Y,F,G,H);
1463             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1464             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1465             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1466
1467             fscal            = felec;
1468
1469              /* Update vectorial force */
1470             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1471             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1472             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1473
1474             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1475             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1476             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1477
1478             /**************************
1479              * CALCULATE INTERACTIONS *
1480              **************************/
1481
1482             r12              = _mm_mul_ps(rsq12,rinv12);
1483
1484             /* Calculate table index by multiplying r with table scale and truncate to integer */
1485             rt               = _mm_mul_ps(r12,vftabscale);
1486             vfitab           = _mm_cvttps_epi32(rt);
1487 #ifdef __XOP__
1488             vfeps            = _mm_frcz_ps(rt);
1489 #else
1490             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1491 #endif
1492             twovfeps         = _mm_add_ps(vfeps,vfeps);
1493             vfitab           = _mm_slli_epi32(vfitab,2);
1494
1495             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1496             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1497             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1498             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1499             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1500             _MM_TRANSPOSE4_PS(Y,F,G,H);
1501             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1502             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1503             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1504
1505             fscal            = felec;
1506
1507              /* Update vectorial force */
1508             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1509             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1510             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1511
1512             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1513             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1514             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1515
1516             /**************************
1517              * CALCULATE INTERACTIONS *
1518              **************************/
1519
1520             r13              = _mm_mul_ps(rsq13,rinv13);
1521
1522             /* Calculate table index by multiplying r with table scale and truncate to integer */
1523             rt               = _mm_mul_ps(r13,vftabscale);
1524             vfitab           = _mm_cvttps_epi32(rt);
1525 #ifdef __XOP__
1526             vfeps            = _mm_frcz_ps(rt);
1527 #else
1528             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1529 #endif
1530             twovfeps         = _mm_add_ps(vfeps,vfeps);
1531             vfitab           = _mm_slli_epi32(vfitab,2);
1532
1533             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1534             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1535             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1536             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1537             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1538             _MM_TRANSPOSE4_PS(Y,F,G,H);
1539             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1540             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1541             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1542
1543             fscal            = felec;
1544
1545              /* Update vectorial force */
1546             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1547             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1548             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1549
1550             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1551             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1552             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1553
1554             /**************************
1555              * CALCULATE INTERACTIONS *
1556              **************************/
1557
1558             r21              = _mm_mul_ps(rsq21,rinv21);
1559
1560             /* Calculate table index by multiplying r with table scale and truncate to integer */
1561             rt               = _mm_mul_ps(r21,vftabscale);
1562             vfitab           = _mm_cvttps_epi32(rt);
1563 #ifdef __XOP__
1564             vfeps            = _mm_frcz_ps(rt);
1565 #else
1566             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1567 #endif
1568             twovfeps         = _mm_add_ps(vfeps,vfeps);
1569             vfitab           = _mm_slli_epi32(vfitab,2);
1570
1571             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1572             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1573             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1574             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1575             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1576             _MM_TRANSPOSE4_PS(Y,F,G,H);
1577             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1578             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1579             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1580
1581             fscal            = felec;
1582
1583              /* Update vectorial force */
1584             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1585             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1586             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1587
1588             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1589             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1590             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1591
1592             /**************************
1593              * CALCULATE INTERACTIONS *
1594              **************************/
1595
1596             r22              = _mm_mul_ps(rsq22,rinv22);
1597
1598             /* Calculate table index by multiplying r with table scale and truncate to integer */
1599             rt               = _mm_mul_ps(r22,vftabscale);
1600             vfitab           = _mm_cvttps_epi32(rt);
1601 #ifdef __XOP__
1602             vfeps            = _mm_frcz_ps(rt);
1603 #else
1604             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1605 #endif
1606             twovfeps         = _mm_add_ps(vfeps,vfeps);
1607             vfitab           = _mm_slli_epi32(vfitab,2);
1608
1609             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1610             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1611             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1612             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1613             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1614             _MM_TRANSPOSE4_PS(Y,F,G,H);
1615             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1616             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1617             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1618
1619             fscal            = felec;
1620
1621              /* Update vectorial force */
1622             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1623             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1624             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1625
1626             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1627             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1628             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1629
1630             /**************************
1631              * CALCULATE INTERACTIONS *
1632              **************************/
1633
1634             r23              = _mm_mul_ps(rsq23,rinv23);
1635
1636             /* Calculate table index by multiplying r with table scale and truncate to integer */
1637             rt               = _mm_mul_ps(r23,vftabscale);
1638             vfitab           = _mm_cvttps_epi32(rt);
1639 #ifdef __XOP__
1640             vfeps            = _mm_frcz_ps(rt);
1641 #else
1642             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1643 #endif
1644             twovfeps         = _mm_add_ps(vfeps,vfeps);
1645             vfitab           = _mm_slli_epi32(vfitab,2);
1646
1647             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1648             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1649             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1650             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1651             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1652             _MM_TRANSPOSE4_PS(Y,F,G,H);
1653             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1654             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1655             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
1656
1657             fscal            = felec;
1658
1659              /* Update vectorial force */
1660             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1661             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1662             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1663
1664             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1665             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1666             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1667
1668             /**************************
1669              * CALCULATE INTERACTIONS *
1670              **************************/
1671
1672             r31              = _mm_mul_ps(rsq31,rinv31);
1673
1674             /* Calculate table index by multiplying r with table scale and truncate to integer */
1675             rt               = _mm_mul_ps(r31,vftabscale);
1676             vfitab           = _mm_cvttps_epi32(rt);
1677 #ifdef __XOP__
1678             vfeps            = _mm_frcz_ps(rt);
1679 #else
1680             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1681 #endif
1682             twovfeps         = _mm_add_ps(vfeps,vfeps);
1683             vfitab           = _mm_slli_epi32(vfitab,2);
1684
1685             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1686             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1687             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1688             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1689             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1690             _MM_TRANSPOSE4_PS(Y,F,G,H);
1691             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1692             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1693             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
1694
1695             fscal            = felec;
1696
1697              /* Update vectorial force */
1698             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1699             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1700             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1701
1702             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1703             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1704             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1705
1706             /**************************
1707              * CALCULATE INTERACTIONS *
1708              **************************/
1709
1710             r32              = _mm_mul_ps(rsq32,rinv32);
1711
1712             /* Calculate table index by multiplying r with table scale and truncate to integer */
1713             rt               = _mm_mul_ps(r32,vftabscale);
1714             vfitab           = _mm_cvttps_epi32(rt);
1715 #ifdef __XOP__
1716             vfeps            = _mm_frcz_ps(rt);
1717 #else
1718             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1719 #endif
1720             twovfeps         = _mm_add_ps(vfeps,vfeps);
1721             vfitab           = _mm_slli_epi32(vfitab,2);
1722
1723             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1724             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1725             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1726             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1727             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1728             _MM_TRANSPOSE4_PS(Y,F,G,H);
1729             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1730             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1731             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
1732
1733             fscal            = felec;
1734
1735              /* Update vectorial force */
1736             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1737             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1738             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1739
1740             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1741             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1742             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1743
1744             /**************************
1745              * CALCULATE INTERACTIONS *
1746              **************************/
1747
1748             r33              = _mm_mul_ps(rsq33,rinv33);
1749
1750             /* Calculate table index by multiplying r with table scale and truncate to integer */
1751             rt               = _mm_mul_ps(r33,vftabscale);
1752             vfitab           = _mm_cvttps_epi32(rt);
1753 #ifdef __XOP__
1754             vfeps            = _mm_frcz_ps(rt);
1755 #else
1756             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1757 #endif
1758             twovfeps         = _mm_add_ps(vfeps,vfeps);
1759             vfitab           = _mm_slli_epi32(vfitab,2);
1760
1761             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1762             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1763             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1764             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1765             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1766             _MM_TRANSPOSE4_PS(Y,F,G,H);
1767             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1768             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1769             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
1770
1771             fscal            = felec;
1772
1773              /* Update vectorial force */
1774             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1775             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1776             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1777
1778             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1779             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1780             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1781
1782             fjptrA             = f+j_coord_offsetA;
1783             fjptrB             = f+j_coord_offsetB;
1784             fjptrC             = f+j_coord_offsetC;
1785             fjptrD             = f+j_coord_offsetD;
1786
1787             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1788                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1789
1790             /* Inner loop uses 378 flops */
1791         }
1792
1793         if(jidx<j_index_end)
1794         {
1795
1796             /* Get j neighbor index, and coordinate index */
1797             jnrlistA         = jjnr[jidx];
1798             jnrlistB         = jjnr[jidx+1];
1799             jnrlistC         = jjnr[jidx+2];
1800             jnrlistD         = jjnr[jidx+3];
1801             /* Sign of each element will be negative for non-real atoms.
1802              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1803              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1804              */
1805             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1806             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1807             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1808             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1809             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1810             j_coord_offsetA  = DIM*jnrA;
1811             j_coord_offsetB  = DIM*jnrB;
1812             j_coord_offsetC  = DIM*jnrC;
1813             j_coord_offsetD  = DIM*jnrD;
1814
1815             /* load j atom coordinates */
1816             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1817                                               x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1818                                               &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1819
1820             /* Calculate displacement vector */
1821             dx11             = _mm_sub_ps(ix1,jx1);
1822             dy11             = _mm_sub_ps(iy1,jy1);
1823             dz11             = _mm_sub_ps(iz1,jz1);
1824             dx12             = _mm_sub_ps(ix1,jx2);
1825             dy12             = _mm_sub_ps(iy1,jy2);
1826             dz12             = _mm_sub_ps(iz1,jz2);
1827             dx13             = _mm_sub_ps(ix1,jx3);
1828             dy13             = _mm_sub_ps(iy1,jy3);
1829             dz13             = _mm_sub_ps(iz1,jz3);
1830             dx21             = _mm_sub_ps(ix2,jx1);
1831             dy21             = _mm_sub_ps(iy2,jy1);
1832             dz21             = _mm_sub_ps(iz2,jz1);
1833             dx22             = _mm_sub_ps(ix2,jx2);
1834             dy22             = _mm_sub_ps(iy2,jy2);
1835             dz22             = _mm_sub_ps(iz2,jz2);
1836             dx23             = _mm_sub_ps(ix2,jx3);
1837             dy23             = _mm_sub_ps(iy2,jy3);
1838             dz23             = _mm_sub_ps(iz2,jz3);
1839             dx31             = _mm_sub_ps(ix3,jx1);
1840             dy31             = _mm_sub_ps(iy3,jy1);
1841             dz31             = _mm_sub_ps(iz3,jz1);
1842             dx32             = _mm_sub_ps(ix3,jx2);
1843             dy32             = _mm_sub_ps(iy3,jy2);
1844             dz32             = _mm_sub_ps(iz3,jz2);
1845             dx33             = _mm_sub_ps(ix3,jx3);
1846             dy33             = _mm_sub_ps(iy3,jy3);
1847             dz33             = _mm_sub_ps(iz3,jz3);
1848
1849             /* Calculate squared distance and things based on it */
1850             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1851             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1852             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1853             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1854             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1855             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1856             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1857             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1858             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1859
1860             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1861             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1862             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1863             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1864             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1865             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1866             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1867             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1868             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1869
1870             fjx1             = _mm_setzero_ps();
1871             fjy1             = _mm_setzero_ps();
1872             fjz1             = _mm_setzero_ps();
1873             fjx2             = _mm_setzero_ps();
1874             fjy2             = _mm_setzero_ps();
1875             fjz2             = _mm_setzero_ps();
1876             fjx3             = _mm_setzero_ps();
1877             fjy3             = _mm_setzero_ps();
1878             fjz3             = _mm_setzero_ps();
1879
1880             /**************************
1881              * CALCULATE INTERACTIONS *
1882              **************************/
1883
1884             r11              = _mm_mul_ps(rsq11,rinv11);
1885             r11              = _mm_andnot_ps(dummy_mask,r11);
1886
1887             /* Calculate table index by multiplying r with table scale and truncate to integer */
1888             rt               = _mm_mul_ps(r11,vftabscale);
1889             vfitab           = _mm_cvttps_epi32(rt);
1890 #ifdef __XOP__
1891             vfeps            = _mm_frcz_ps(rt);
1892 #else
1893             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1894 #endif
1895             twovfeps         = _mm_add_ps(vfeps,vfeps);
1896             vfitab           = _mm_slli_epi32(vfitab,2);
1897
1898             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1899             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1900             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1901             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1902             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1903             _MM_TRANSPOSE4_PS(Y,F,G,H);
1904             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1905             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1906             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1907
1908             fscal            = felec;
1909
1910             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1911
1912              /* Update vectorial force */
1913             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1914             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1915             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1916
1917             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1918             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1919             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1920
1921             /**************************
1922              * CALCULATE INTERACTIONS *
1923              **************************/
1924
1925             r12              = _mm_mul_ps(rsq12,rinv12);
1926             r12              = _mm_andnot_ps(dummy_mask,r12);
1927
1928             /* Calculate table index by multiplying r with table scale and truncate to integer */
1929             rt               = _mm_mul_ps(r12,vftabscale);
1930             vfitab           = _mm_cvttps_epi32(rt);
1931 #ifdef __XOP__
1932             vfeps            = _mm_frcz_ps(rt);
1933 #else
1934             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1935 #endif
1936             twovfeps         = _mm_add_ps(vfeps,vfeps);
1937             vfitab           = _mm_slli_epi32(vfitab,2);
1938
1939             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1940             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1941             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1942             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1943             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1944             _MM_TRANSPOSE4_PS(Y,F,G,H);
1945             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1946             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1947             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1948
1949             fscal            = felec;
1950
1951             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1952
1953              /* Update vectorial force */
1954             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1955             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1956             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1957
1958             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1959             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1960             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1961
1962             /**************************
1963              * CALCULATE INTERACTIONS *
1964              **************************/
1965
1966             r13              = _mm_mul_ps(rsq13,rinv13);
1967             r13              = _mm_andnot_ps(dummy_mask,r13);
1968
1969             /* Calculate table index by multiplying r with table scale and truncate to integer */
1970             rt               = _mm_mul_ps(r13,vftabscale);
1971             vfitab           = _mm_cvttps_epi32(rt);
1972 #ifdef __XOP__
1973             vfeps            = _mm_frcz_ps(rt);
1974 #else
1975             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1976 #endif
1977             twovfeps         = _mm_add_ps(vfeps,vfeps);
1978             vfitab           = _mm_slli_epi32(vfitab,2);
1979
1980             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1981             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1982             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1983             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1984             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1985             _MM_TRANSPOSE4_PS(Y,F,G,H);
1986             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1987             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1988             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq13,FF),_mm_mul_ps(vftabscale,rinv13)));
1989
1990             fscal            = felec;
1991
1992             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1993
1994              /* Update vectorial force */
1995             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1996             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1997             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1998
1999             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
2000             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
2001             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
2002
2003             /**************************
2004              * CALCULATE INTERACTIONS *
2005              **************************/
2006
2007             r21              = _mm_mul_ps(rsq21,rinv21);
2008             r21              = _mm_andnot_ps(dummy_mask,r21);
2009
2010             /* Calculate table index by multiplying r with table scale and truncate to integer */
2011             rt               = _mm_mul_ps(r21,vftabscale);
2012             vfitab           = _mm_cvttps_epi32(rt);
2013 #ifdef __XOP__
2014             vfeps            = _mm_frcz_ps(rt);
2015 #else
2016             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2017 #endif
2018             twovfeps         = _mm_add_ps(vfeps,vfeps);
2019             vfitab           = _mm_slli_epi32(vfitab,2);
2020
2021             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2022             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2023             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2024             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2025             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2026             _MM_TRANSPOSE4_PS(Y,F,G,H);
2027             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2028             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2029             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2030
2031             fscal            = felec;
2032
2033             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2034
2035              /* Update vectorial force */
2036             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2037             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2038             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2039
2040             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2041             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2042             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2043
2044             /**************************
2045              * CALCULATE INTERACTIONS *
2046              **************************/
2047
2048             r22              = _mm_mul_ps(rsq22,rinv22);
2049             r22              = _mm_andnot_ps(dummy_mask,r22);
2050
2051             /* Calculate table index by multiplying r with table scale and truncate to integer */
2052             rt               = _mm_mul_ps(r22,vftabscale);
2053             vfitab           = _mm_cvttps_epi32(rt);
2054 #ifdef __XOP__
2055             vfeps            = _mm_frcz_ps(rt);
2056 #else
2057             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2058 #endif
2059             twovfeps         = _mm_add_ps(vfeps,vfeps);
2060             vfitab           = _mm_slli_epi32(vfitab,2);
2061
2062             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2063             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2064             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2065             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2066             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2067             _MM_TRANSPOSE4_PS(Y,F,G,H);
2068             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2069             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2070             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2071
2072             fscal            = felec;
2073
2074             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2075
2076              /* Update vectorial force */
2077             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2078             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2079             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2080
2081             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2082             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2083             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2084
2085             /**************************
2086              * CALCULATE INTERACTIONS *
2087              **************************/
2088
2089             r23              = _mm_mul_ps(rsq23,rinv23);
2090             r23              = _mm_andnot_ps(dummy_mask,r23);
2091
2092             /* Calculate table index by multiplying r with table scale and truncate to integer */
2093             rt               = _mm_mul_ps(r23,vftabscale);
2094             vfitab           = _mm_cvttps_epi32(rt);
2095 #ifdef __XOP__
2096             vfeps            = _mm_frcz_ps(rt);
2097 #else
2098             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2099 #endif
2100             twovfeps         = _mm_add_ps(vfeps,vfeps);
2101             vfitab           = _mm_slli_epi32(vfitab,2);
2102
2103             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2104             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2105             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2106             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2107             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2108             _MM_TRANSPOSE4_PS(Y,F,G,H);
2109             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2110             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2111             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq23,FF),_mm_mul_ps(vftabscale,rinv23)));
2112
2113             fscal            = felec;
2114
2115             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2116
2117              /* Update vectorial force */
2118             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2119             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2120             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2121
2122             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2123             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2124             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2125
2126             /**************************
2127              * CALCULATE INTERACTIONS *
2128              **************************/
2129
2130             r31              = _mm_mul_ps(rsq31,rinv31);
2131             r31              = _mm_andnot_ps(dummy_mask,r31);
2132
2133             /* Calculate table index by multiplying r with table scale and truncate to integer */
2134             rt               = _mm_mul_ps(r31,vftabscale);
2135             vfitab           = _mm_cvttps_epi32(rt);
2136 #ifdef __XOP__
2137             vfeps            = _mm_frcz_ps(rt);
2138 #else
2139             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2140 #endif
2141             twovfeps         = _mm_add_ps(vfeps,vfeps);
2142             vfitab           = _mm_slli_epi32(vfitab,2);
2143
2144             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2145             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2146             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2147             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2148             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2149             _MM_TRANSPOSE4_PS(Y,F,G,H);
2150             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2151             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2152             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq31,FF),_mm_mul_ps(vftabscale,rinv31)));
2153
2154             fscal            = felec;
2155
2156             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2157
2158              /* Update vectorial force */
2159             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2160             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2161             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2162
2163             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2164             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2165             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2166
2167             /**************************
2168              * CALCULATE INTERACTIONS *
2169              **************************/
2170
2171             r32              = _mm_mul_ps(rsq32,rinv32);
2172             r32              = _mm_andnot_ps(dummy_mask,r32);
2173
2174             /* Calculate table index by multiplying r with table scale and truncate to integer */
2175             rt               = _mm_mul_ps(r32,vftabscale);
2176             vfitab           = _mm_cvttps_epi32(rt);
2177 #ifdef __XOP__
2178             vfeps            = _mm_frcz_ps(rt);
2179 #else
2180             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2181 #endif
2182             twovfeps         = _mm_add_ps(vfeps,vfeps);
2183             vfitab           = _mm_slli_epi32(vfitab,2);
2184
2185             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2186             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2187             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2188             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2189             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2190             _MM_TRANSPOSE4_PS(Y,F,G,H);
2191             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2192             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2193             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq32,FF),_mm_mul_ps(vftabscale,rinv32)));
2194
2195             fscal            = felec;
2196
2197             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2198
2199              /* Update vectorial force */
2200             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2201             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2202             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2203
2204             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2205             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2206             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2207
2208             /**************************
2209              * CALCULATE INTERACTIONS *
2210              **************************/
2211
2212             r33              = _mm_mul_ps(rsq33,rinv33);
2213             r33              = _mm_andnot_ps(dummy_mask,r33);
2214
2215             /* Calculate table index by multiplying r with table scale and truncate to integer */
2216             rt               = _mm_mul_ps(r33,vftabscale);
2217             vfitab           = _mm_cvttps_epi32(rt);
2218 #ifdef __XOP__
2219             vfeps            = _mm_frcz_ps(rt);
2220 #else
2221             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
2222 #endif
2223             twovfeps         = _mm_add_ps(vfeps,vfeps);
2224             vfitab           = _mm_slli_epi32(vfitab,2);
2225
2226             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2227             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
2228             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
2229             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
2230             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
2231             _MM_TRANSPOSE4_PS(Y,F,G,H);
2232             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
2233             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
2234             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq33,FF),_mm_mul_ps(vftabscale,rinv33)));
2235
2236             fscal            = felec;
2237
2238             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2239
2240              /* Update vectorial force */
2241             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2242             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2243             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2244
2245             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2246             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2247             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2248
2249             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2250             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2251             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2252             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2253
2254             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2255                                                    fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2256
2257             /* Inner loop uses 387 flops */
2258         }
2259
2260         /* End of innermost loop */
2261
2262         gmx_mm_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2263                                               f+i_coord_offset+DIM,fshift+i_shift_offset);
2264
2265         /* Increment number of inner iterations */
2266         inneriter                  += j_index_end - j_index_start;
2267
2268         /* Outer loop uses 18 flops */
2269     }
2270
2271     /* Increment number of outer iterations */
2272     outeriter        += nri;
2273
2274     /* Update outer/inner flops */
2275
2276     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*387);
2277 }