Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwCSTab_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            CubicSplineTable
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwioffset3;
92     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
112     real             *charge;
113     int              nvdwtype;
114     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115     int              *vdwtype;
116     real             *vdwparam;
117     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
118     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
119     __m128i          vfitab;
120     __m128i          ifour       = _mm_set1_epi32(4);
121     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
122     real             *vftab;
123     __m128i          ewitab;
124     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
125     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
126     real             *ewtab;
127     __m128           dummy_mask,cutoff_mask;
128     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
129     __m128           one     = _mm_set1_ps(1.0);
130     __m128           two     = _mm_set1_ps(2.0);
131     x                = xx[0];
132     f                = ff[0];
133
134     nri              = nlist->nri;
135     iinr             = nlist->iinr;
136     jindex           = nlist->jindex;
137     jjnr             = nlist->jjnr;
138     shiftidx         = nlist->shift;
139     gid              = nlist->gid;
140     shiftvec         = fr->shift_vec[0];
141     fshift           = fr->fshift[0];
142     facel            = _mm_set1_ps(fr->epsfac);
143     charge           = mdatoms->chargeA;
144     nvdwtype         = fr->ntype;
145     vdwparam         = fr->nbfp;
146     vdwtype          = mdatoms->typeA;
147
148     vftab            = kernel_data->table_vdw->data;
149     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
150
151     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
152     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
153     beta2            = _mm_mul_ps(beta,beta);
154     beta3            = _mm_mul_ps(beta,beta2);
155     ewtab            = fr->ic->tabq_coul_FDV0;
156     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
157     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
158
159     /* Setup water-specific parameters */
160     inr              = nlist->iinr[0];
161     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
162     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
163     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
164     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
165
166     jq1              = _mm_set1_ps(charge[inr+1]);
167     jq2              = _mm_set1_ps(charge[inr+2]);
168     jq3              = _mm_set1_ps(charge[inr+3]);
169     vdwjidx0A        = 2*vdwtype[inr+0];
170     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
171     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
172     qq11             = _mm_mul_ps(iq1,jq1);
173     qq12             = _mm_mul_ps(iq1,jq2);
174     qq13             = _mm_mul_ps(iq1,jq3);
175     qq21             = _mm_mul_ps(iq2,jq1);
176     qq22             = _mm_mul_ps(iq2,jq2);
177     qq23             = _mm_mul_ps(iq2,jq3);
178     qq31             = _mm_mul_ps(iq3,jq1);
179     qq32             = _mm_mul_ps(iq3,jq2);
180     qq33             = _mm_mul_ps(iq3,jq3);
181
182     /* Avoid stupid compiler warnings */
183     jnrA = jnrB = jnrC = jnrD = 0;
184     j_coord_offsetA = 0;
185     j_coord_offsetB = 0;
186     j_coord_offsetC = 0;
187     j_coord_offsetD = 0;
188
189     outeriter        = 0;
190     inneriter        = 0;
191
192     for(iidx=0;iidx<4*DIM;iidx++)
193     {
194         scratch[iidx] = 0.0;
195     }
196
197     /* Start outer loop over neighborlists */
198     for(iidx=0; iidx<nri; iidx++)
199     {
200         /* Load shift vector for this list */
201         i_shift_offset   = DIM*shiftidx[iidx];
202
203         /* Load limits for loop over neighbors */
204         j_index_start    = jindex[iidx];
205         j_index_end      = jindex[iidx+1];
206
207         /* Get outer coordinate index */
208         inr              = iinr[iidx];
209         i_coord_offset   = DIM*inr;
210
211         /* Load i particle coords and add shift vector */
212         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
213                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
214
215         fix0             = _mm_setzero_ps();
216         fiy0             = _mm_setzero_ps();
217         fiz0             = _mm_setzero_ps();
218         fix1             = _mm_setzero_ps();
219         fiy1             = _mm_setzero_ps();
220         fiz1             = _mm_setzero_ps();
221         fix2             = _mm_setzero_ps();
222         fiy2             = _mm_setzero_ps();
223         fiz2             = _mm_setzero_ps();
224         fix3             = _mm_setzero_ps();
225         fiy3             = _mm_setzero_ps();
226         fiz3             = _mm_setzero_ps();
227
228         /* Reset potential sums */
229         velecsum         = _mm_setzero_ps();
230         vvdwsum          = _mm_setzero_ps();
231
232         /* Start inner kernel loop */
233         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
234         {
235
236             /* Get j neighbor index, and coordinate index */
237             jnrA             = jjnr[jidx];
238             jnrB             = jjnr[jidx+1];
239             jnrC             = jjnr[jidx+2];
240             jnrD             = jjnr[jidx+3];
241             j_coord_offsetA  = DIM*jnrA;
242             j_coord_offsetB  = DIM*jnrB;
243             j_coord_offsetC  = DIM*jnrC;
244             j_coord_offsetD  = DIM*jnrD;
245
246             /* load j atom coordinates */
247             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
248                                               x+j_coord_offsetC,x+j_coord_offsetD,
249                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
250                                               &jy2,&jz2,&jx3,&jy3,&jz3);
251
252             /* Calculate displacement vector */
253             dx00             = _mm_sub_ps(ix0,jx0);
254             dy00             = _mm_sub_ps(iy0,jy0);
255             dz00             = _mm_sub_ps(iz0,jz0);
256             dx11             = _mm_sub_ps(ix1,jx1);
257             dy11             = _mm_sub_ps(iy1,jy1);
258             dz11             = _mm_sub_ps(iz1,jz1);
259             dx12             = _mm_sub_ps(ix1,jx2);
260             dy12             = _mm_sub_ps(iy1,jy2);
261             dz12             = _mm_sub_ps(iz1,jz2);
262             dx13             = _mm_sub_ps(ix1,jx3);
263             dy13             = _mm_sub_ps(iy1,jy3);
264             dz13             = _mm_sub_ps(iz1,jz3);
265             dx21             = _mm_sub_ps(ix2,jx1);
266             dy21             = _mm_sub_ps(iy2,jy1);
267             dz21             = _mm_sub_ps(iz2,jz1);
268             dx22             = _mm_sub_ps(ix2,jx2);
269             dy22             = _mm_sub_ps(iy2,jy2);
270             dz22             = _mm_sub_ps(iz2,jz2);
271             dx23             = _mm_sub_ps(ix2,jx3);
272             dy23             = _mm_sub_ps(iy2,jy3);
273             dz23             = _mm_sub_ps(iz2,jz3);
274             dx31             = _mm_sub_ps(ix3,jx1);
275             dy31             = _mm_sub_ps(iy3,jy1);
276             dz31             = _mm_sub_ps(iz3,jz1);
277             dx32             = _mm_sub_ps(ix3,jx2);
278             dy32             = _mm_sub_ps(iy3,jy2);
279             dz32             = _mm_sub_ps(iz3,jz2);
280             dx33             = _mm_sub_ps(ix3,jx3);
281             dy33             = _mm_sub_ps(iy3,jy3);
282             dz33             = _mm_sub_ps(iz3,jz3);
283
284             /* Calculate squared distance and things based on it */
285             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
286             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
287             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
288             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
289             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
290             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
291             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
292             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
293             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
294             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
295
296             rinv00           = gmx_mm_invsqrt_ps(rsq00);
297             rinv11           = gmx_mm_invsqrt_ps(rsq11);
298             rinv12           = gmx_mm_invsqrt_ps(rsq12);
299             rinv13           = gmx_mm_invsqrt_ps(rsq13);
300             rinv21           = gmx_mm_invsqrt_ps(rsq21);
301             rinv22           = gmx_mm_invsqrt_ps(rsq22);
302             rinv23           = gmx_mm_invsqrt_ps(rsq23);
303             rinv31           = gmx_mm_invsqrt_ps(rsq31);
304             rinv32           = gmx_mm_invsqrt_ps(rsq32);
305             rinv33           = gmx_mm_invsqrt_ps(rsq33);
306
307             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
308             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
309             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
310             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
311             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
312             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
313             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
314             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
315             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
316
317             fjx0             = _mm_setzero_ps();
318             fjy0             = _mm_setzero_ps();
319             fjz0             = _mm_setzero_ps();
320             fjx1             = _mm_setzero_ps();
321             fjy1             = _mm_setzero_ps();
322             fjz1             = _mm_setzero_ps();
323             fjx2             = _mm_setzero_ps();
324             fjy2             = _mm_setzero_ps();
325             fjz2             = _mm_setzero_ps();
326             fjx3             = _mm_setzero_ps();
327             fjy3             = _mm_setzero_ps();
328             fjz3             = _mm_setzero_ps();
329
330             /**************************
331              * CALCULATE INTERACTIONS *
332              **************************/
333
334             r00              = _mm_mul_ps(rsq00,rinv00);
335
336             /* Calculate table index by multiplying r with table scale and truncate to integer */
337             rt               = _mm_mul_ps(r00,vftabscale);
338             vfitab           = _mm_cvttps_epi32(rt);
339 #ifdef __XOP__
340             vfeps            = _mm_frcz_ps(rt);
341 #else
342             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
343 #endif
344             twovfeps         = _mm_add_ps(vfeps,vfeps);
345             vfitab           = _mm_slli_epi32(vfitab,3);
346
347             /* CUBIC SPLINE TABLE DISPERSION */
348             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
349             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
350             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
351             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
352             _MM_TRANSPOSE4_PS(Y,F,G,H);
353             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
354             VV               = _mm_macc_ps(vfeps,Fp,Y);
355             vvdw6            = _mm_mul_ps(c6_00,VV);
356             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
357             fvdw6            = _mm_mul_ps(c6_00,FF);
358
359             /* CUBIC SPLINE TABLE REPULSION */
360             vfitab           = _mm_add_epi32(vfitab,ifour);
361             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
362             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
363             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
364             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
365             _MM_TRANSPOSE4_PS(Y,F,G,H);
366             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
367             VV               = _mm_macc_ps(vfeps,Fp,Y);
368             vvdw12           = _mm_mul_ps(c12_00,VV);
369             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
370             fvdw12           = _mm_mul_ps(c12_00,FF);
371             vvdw             = _mm_add_ps(vvdw12,vvdw6);
372             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
373
374             /* Update potential sum for this i atom from the interaction with this j atom. */
375             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
376
377             fscal            = fvdw;
378
379              /* Update vectorial force */
380             fix0             = _mm_macc_ps(dx00,fscal,fix0);
381             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
382             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
383
384             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
385             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
386             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
387
388             /**************************
389              * CALCULATE INTERACTIONS *
390              **************************/
391
392             r11              = _mm_mul_ps(rsq11,rinv11);
393
394             /* EWALD ELECTROSTATICS */
395
396             /* Analytical PME correction */
397             zeta2            = _mm_mul_ps(beta2,rsq11);
398             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
399             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
400             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
401             felec            = _mm_mul_ps(qq11,felec);
402             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
403             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
404             velec            = _mm_mul_ps(qq11,velec);
405
406             /* Update potential sum for this i atom from the interaction with this j atom. */
407             velecsum         = _mm_add_ps(velecsum,velec);
408
409             fscal            = felec;
410
411              /* Update vectorial force */
412             fix1             = _mm_macc_ps(dx11,fscal,fix1);
413             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
414             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
415
416             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
417             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
418             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
419
420             /**************************
421              * CALCULATE INTERACTIONS *
422              **************************/
423
424             r12              = _mm_mul_ps(rsq12,rinv12);
425
426             /* EWALD ELECTROSTATICS */
427
428             /* Analytical PME correction */
429             zeta2            = _mm_mul_ps(beta2,rsq12);
430             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
431             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
432             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
433             felec            = _mm_mul_ps(qq12,felec);
434             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
435             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
436             velec            = _mm_mul_ps(qq12,velec);
437
438             /* Update potential sum for this i atom from the interaction with this j atom. */
439             velecsum         = _mm_add_ps(velecsum,velec);
440
441             fscal            = felec;
442
443              /* Update vectorial force */
444             fix1             = _mm_macc_ps(dx12,fscal,fix1);
445             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
446             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
447
448             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
449             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
450             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
451
452             /**************************
453              * CALCULATE INTERACTIONS *
454              **************************/
455
456             r13              = _mm_mul_ps(rsq13,rinv13);
457
458             /* EWALD ELECTROSTATICS */
459
460             /* Analytical PME correction */
461             zeta2            = _mm_mul_ps(beta2,rsq13);
462             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
463             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
464             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
465             felec            = _mm_mul_ps(qq13,felec);
466             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
467             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
468             velec            = _mm_mul_ps(qq13,velec);
469
470             /* Update potential sum for this i atom from the interaction with this j atom. */
471             velecsum         = _mm_add_ps(velecsum,velec);
472
473             fscal            = felec;
474
475              /* Update vectorial force */
476             fix1             = _mm_macc_ps(dx13,fscal,fix1);
477             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
478             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
479
480             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
481             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
482             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
483
484             /**************************
485              * CALCULATE INTERACTIONS *
486              **************************/
487
488             r21              = _mm_mul_ps(rsq21,rinv21);
489
490             /* EWALD ELECTROSTATICS */
491
492             /* Analytical PME correction */
493             zeta2            = _mm_mul_ps(beta2,rsq21);
494             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
495             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
496             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
497             felec            = _mm_mul_ps(qq21,felec);
498             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
499             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
500             velec            = _mm_mul_ps(qq21,velec);
501
502             /* Update potential sum for this i atom from the interaction with this j atom. */
503             velecsum         = _mm_add_ps(velecsum,velec);
504
505             fscal            = felec;
506
507              /* Update vectorial force */
508             fix2             = _mm_macc_ps(dx21,fscal,fix2);
509             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
510             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
511
512             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
513             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
514             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
515
516             /**************************
517              * CALCULATE INTERACTIONS *
518              **************************/
519
520             r22              = _mm_mul_ps(rsq22,rinv22);
521
522             /* EWALD ELECTROSTATICS */
523
524             /* Analytical PME correction */
525             zeta2            = _mm_mul_ps(beta2,rsq22);
526             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
527             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
528             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
529             felec            = _mm_mul_ps(qq22,felec);
530             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
531             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
532             velec            = _mm_mul_ps(qq22,velec);
533
534             /* Update potential sum for this i atom from the interaction with this j atom. */
535             velecsum         = _mm_add_ps(velecsum,velec);
536
537             fscal            = felec;
538
539              /* Update vectorial force */
540             fix2             = _mm_macc_ps(dx22,fscal,fix2);
541             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
542             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
543
544             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
545             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
546             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
547
548             /**************************
549              * CALCULATE INTERACTIONS *
550              **************************/
551
552             r23              = _mm_mul_ps(rsq23,rinv23);
553
554             /* EWALD ELECTROSTATICS */
555
556             /* Analytical PME correction */
557             zeta2            = _mm_mul_ps(beta2,rsq23);
558             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
559             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
560             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
561             felec            = _mm_mul_ps(qq23,felec);
562             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
563             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
564             velec            = _mm_mul_ps(qq23,velec);
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velecsum         = _mm_add_ps(velecsum,velec);
568
569             fscal            = felec;
570
571              /* Update vectorial force */
572             fix2             = _mm_macc_ps(dx23,fscal,fix2);
573             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
574             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
575
576             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
577             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
578             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
579
580             /**************************
581              * CALCULATE INTERACTIONS *
582              **************************/
583
584             r31              = _mm_mul_ps(rsq31,rinv31);
585
586             /* EWALD ELECTROSTATICS */
587
588             /* Analytical PME correction */
589             zeta2            = _mm_mul_ps(beta2,rsq31);
590             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
591             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
592             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
593             felec            = _mm_mul_ps(qq31,felec);
594             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
595             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
596             velec            = _mm_mul_ps(qq31,velec);
597
598             /* Update potential sum for this i atom from the interaction with this j atom. */
599             velecsum         = _mm_add_ps(velecsum,velec);
600
601             fscal            = felec;
602
603              /* Update vectorial force */
604             fix3             = _mm_macc_ps(dx31,fscal,fix3);
605             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
606             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
607
608             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
609             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
610             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
611
612             /**************************
613              * CALCULATE INTERACTIONS *
614              **************************/
615
616             r32              = _mm_mul_ps(rsq32,rinv32);
617
618             /* EWALD ELECTROSTATICS */
619
620             /* Analytical PME correction */
621             zeta2            = _mm_mul_ps(beta2,rsq32);
622             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
623             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
624             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
625             felec            = _mm_mul_ps(qq32,felec);
626             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
627             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
628             velec            = _mm_mul_ps(qq32,velec);
629
630             /* Update potential sum for this i atom from the interaction with this j atom. */
631             velecsum         = _mm_add_ps(velecsum,velec);
632
633             fscal            = felec;
634
635              /* Update vectorial force */
636             fix3             = _mm_macc_ps(dx32,fscal,fix3);
637             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
638             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
639
640             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
641             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
642             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
643
644             /**************************
645              * CALCULATE INTERACTIONS *
646              **************************/
647
648             r33              = _mm_mul_ps(rsq33,rinv33);
649
650             /* EWALD ELECTROSTATICS */
651
652             /* Analytical PME correction */
653             zeta2            = _mm_mul_ps(beta2,rsq33);
654             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
655             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
656             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
657             felec            = _mm_mul_ps(qq33,felec);
658             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
659             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
660             velec            = _mm_mul_ps(qq33,velec);
661
662             /* Update potential sum for this i atom from the interaction with this j atom. */
663             velecsum         = _mm_add_ps(velecsum,velec);
664
665             fscal            = felec;
666
667              /* Update vectorial force */
668             fix3             = _mm_macc_ps(dx33,fscal,fix3);
669             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
670             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
671
672             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
673             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
674             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
675
676             fjptrA             = f+j_coord_offsetA;
677             fjptrB             = f+j_coord_offsetB;
678             fjptrC             = f+j_coord_offsetC;
679             fjptrD             = f+j_coord_offsetD;
680
681             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
682                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
683                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
684
685             /* Inner loop uses 323 flops */
686         }
687
688         if(jidx<j_index_end)
689         {
690
691             /* Get j neighbor index, and coordinate index */
692             jnrlistA         = jjnr[jidx];
693             jnrlistB         = jjnr[jidx+1];
694             jnrlistC         = jjnr[jidx+2];
695             jnrlistD         = jjnr[jidx+3];
696             /* Sign of each element will be negative for non-real atoms.
697              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
698              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
699              */
700             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
701             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
702             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
703             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
704             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
705             j_coord_offsetA  = DIM*jnrA;
706             j_coord_offsetB  = DIM*jnrB;
707             j_coord_offsetC  = DIM*jnrC;
708             j_coord_offsetD  = DIM*jnrD;
709
710             /* load j atom coordinates */
711             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
712                                               x+j_coord_offsetC,x+j_coord_offsetD,
713                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
714                                               &jy2,&jz2,&jx3,&jy3,&jz3);
715
716             /* Calculate displacement vector */
717             dx00             = _mm_sub_ps(ix0,jx0);
718             dy00             = _mm_sub_ps(iy0,jy0);
719             dz00             = _mm_sub_ps(iz0,jz0);
720             dx11             = _mm_sub_ps(ix1,jx1);
721             dy11             = _mm_sub_ps(iy1,jy1);
722             dz11             = _mm_sub_ps(iz1,jz1);
723             dx12             = _mm_sub_ps(ix1,jx2);
724             dy12             = _mm_sub_ps(iy1,jy2);
725             dz12             = _mm_sub_ps(iz1,jz2);
726             dx13             = _mm_sub_ps(ix1,jx3);
727             dy13             = _mm_sub_ps(iy1,jy3);
728             dz13             = _mm_sub_ps(iz1,jz3);
729             dx21             = _mm_sub_ps(ix2,jx1);
730             dy21             = _mm_sub_ps(iy2,jy1);
731             dz21             = _mm_sub_ps(iz2,jz1);
732             dx22             = _mm_sub_ps(ix2,jx2);
733             dy22             = _mm_sub_ps(iy2,jy2);
734             dz22             = _mm_sub_ps(iz2,jz2);
735             dx23             = _mm_sub_ps(ix2,jx3);
736             dy23             = _mm_sub_ps(iy2,jy3);
737             dz23             = _mm_sub_ps(iz2,jz3);
738             dx31             = _mm_sub_ps(ix3,jx1);
739             dy31             = _mm_sub_ps(iy3,jy1);
740             dz31             = _mm_sub_ps(iz3,jz1);
741             dx32             = _mm_sub_ps(ix3,jx2);
742             dy32             = _mm_sub_ps(iy3,jy2);
743             dz32             = _mm_sub_ps(iz3,jz2);
744             dx33             = _mm_sub_ps(ix3,jx3);
745             dy33             = _mm_sub_ps(iy3,jy3);
746             dz33             = _mm_sub_ps(iz3,jz3);
747
748             /* Calculate squared distance and things based on it */
749             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
750             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
751             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
752             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
753             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
754             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
755             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
756             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
757             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
758             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
759
760             rinv00           = gmx_mm_invsqrt_ps(rsq00);
761             rinv11           = gmx_mm_invsqrt_ps(rsq11);
762             rinv12           = gmx_mm_invsqrt_ps(rsq12);
763             rinv13           = gmx_mm_invsqrt_ps(rsq13);
764             rinv21           = gmx_mm_invsqrt_ps(rsq21);
765             rinv22           = gmx_mm_invsqrt_ps(rsq22);
766             rinv23           = gmx_mm_invsqrt_ps(rsq23);
767             rinv31           = gmx_mm_invsqrt_ps(rsq31);
768             rinv32           = gmx_mm_invsqrt_ps(rsq32);
769             rinv33           = gmx_mm_invsqrt_ps(rsq33);
770
771             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
772             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
773             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
774             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
775             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
776             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
777             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
778             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
779             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
780
781             fjx0             = _mm_setzero_ps();
782             fjy0             = _mm_setzero_ps();
783             fjz0             = _mm_setzero_ps();
784             fjx1             = _mm_setzero_ps();
785             fjy1             = _mm_setzero_ps();
786             fjz1             = _mm_setzero_ps();
787             fjx2             = _mm_setzero_ps();
788             fjy2             = _mm_setzero_ps();
789             fjz2             = _mm_setzero_ps();
790             fjx3             = _mm_setzero_ps();
791             fjy3             = _mm_setzero_ps();
792             fjz3             = _mm_setzero_ps();
793
794             /**************************
795              * CALCULATE INTERACTIONS *
796              **************************/
797
798             r00              = _mm_mul_ps(rsq00,rinv00);
799             r00              = _mm_andnot_ps(dummy_mask,r00);
800
801             /* Calculate table index by multiplying r with table scale and truncate to integer */
802             rt               = _mm_mul_ps(r00,vftabscale);
803             vfitab           = _mm_cvttps_epi32(rt);
804 #ifdef __XOP__
805             vfeps            = _mm_frcz_ps(rt);
806 #else
807             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
808 #endif
809             twovfeps         = _mm_add_ps(vfeps,vfeps);
810             vfitab           = _mm_slli_epi32(vfitab,3);
811
812             /* CUBIC SPLINE TABLE DISPERSION */
813             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
814             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
815             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
816             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
817             _MM_TRANSPOSE4_PS(Y,F,G,H);
818             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
819             VV               = _mm_macc_ps(vfeps,Fp,Y);
820             vvdw6            = _mm_mul_ps(c6_00,VV);
821             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
822             fvdw6            = _mm_mul_ps(c6_00,FF);
823
824             /* CUBIC SPLINE TABLE REPULSION */
825             vfitab           = _mm_add_epi32(vfitab,ifour);
826             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
827             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
828             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
829             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
830             _MM_TRANSPOSE4_PS(Y,F,G,H);
831             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
832             VV               = _mm_macc_ps(vfeps,Fp,Y);
833             vvdw12           = _mm_mul_ps(c12_00,VV);
834             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
835             fvdw12           = _mm_mul_ps(c12_00,FF);
836             vvdw             = _mm_add_ps(vvdw12,vvdw6);
837             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
838
839             /* Update potential sum for this i atom from the interaction with this j atom. */
840             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
841             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
842
843             fscal            = fvdw;
844
845             fscal            = _mm_andnot_ps(dummy_mask,fscal);
846
847              /* Update vectorial force */
848             fix0             = _mm_macc_ps(dx00,fscal,fix0);
849             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
850             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
851
852             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
853             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
854             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
855
856             /**************************
857              * CALCULATE INTERACTIONS *
858              **************************/
859
860             r11              = _mm_mul_ps(rsq11,rinv11);
861             r11              = _mm_andnot_ps(dummy_mask,r11);
862
863             /* EWALD ELECTROSTATICS */
864
865             /* Analytical PME correction */
866             zeta2            = _mm_mul_ps(beta2,rsq11);
867             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
868             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
869             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
870             felec            = _mm_mul_ps(qq11,felec);
871             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
872             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
873             velec            = _mm_mul_ps(qq11,velec);
874
875             /* Update potential sum for this i atom from the interaction with this j atom. */
876             velec            = _mm_andnot_ps(dummy_mask,velec);
877             velecsum         = _mm_add_ps(velecsum,velec);
878
879             fscal            = felec;
880
881             fscal            = _mm_andnot_ps(dummy_mask,fscal);
882
883              /* Update vectorial force */
884             fix1             = _mm_macc_ps(dx11,fscal,fix1);
885             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
886             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
887
888             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
889             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
890             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
891
892             /**************************
893              * CALCULATE INTERACTIONS *
894              **************************/
895
896             r12              = _mm_mul_ps(rsq12,rinv12);
897             r12              = _mm_andnot_ps(dummy_mask,r12);
898
899             /* EWALD ELECTROSTATICS */
900
901             /* Analytical PME correction */
902             zeta2            = _mm_mul_ps(beta2,rsq12);
903             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
904             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
905             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
906             felec            = _mm_mul_ps(qq12,felec);
907             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
908             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
909             velec            = _mm_mul_ps(qq12,velec);
910
911             /* Update potential sum for this i atom from the interaction with this j atom. */
912             velec            = _mm_andnot_ps(dummy_mask,velec);
913             velecsum         = _mm_add_ps(velecsum,velec);
914
915             fscal            = felec;
916
917             fscal            = _mm_andnot_ps(dummy_mask,fscal);
918
919              /* Update vectorial force */
920             fix1             = _mm_macc_ps(dx12,fscal,fix1);
921             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
922             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
923
924             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
925             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
926             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
927
928             /**************************
929              * CALCULATE INTERACTIONS *
930              **************************/
931
932             r13              = _mm_mul_ps(rsq13,rinv13);
933             r13              = _mm_andnot_ps(dummy_mask,r13);
934
935             /* EWALD ELECTROSTATICS */
936
937             /* Analytical PME correction */
938             zeta2            = _mm_mul_ps(beta2,rsq13);
939             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
940             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
941             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
942             felec            = _mm_mul_ps(qq13,felec);
943             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
944             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
945             velec            = _mm_mul_ps(qq13,velec);
946
947             /* Update potential sum for this i atom from the interaction with this j atom. */
948             velec            = _mm_andnot_ps(dummy_mask,velec);
949             velecsum         = _mm_add_ps(velecsum,velec);
950
951             fscal            = felec;
952
953             fscal            = _mm_andnot_ps(dummy_mask,fscal);
954
955              /* Update vectorial force */
956             fix1             = _mm_macc_ps(dx13,fscal,fix1);
957             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
958             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
959
960             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
961             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
962             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
963
964             /**************************
965              * CALCULATE INTERACTIONS *
966              **************************/
967
968             r21              = _mm_mul_ps(rsq21,rinv21);
969             r21              = _mm_andnot_ps(dummy_mask,r21);
970
971             /* EWALD ELECTROSTATICS */
972
973             /* Analytical PME correction */
974             zeta2            = _mm_mul_ps(beta2,rsq21);
975             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
976             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
977             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
978             felec            = _mm_mul_ps(qq21,felec);
979             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
980             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
981             velec            = _mm_mul_ps(qq21,velec);
982
983             /* Update potential sum for this i atom from the interaction with this j atom. */
984             velec            = _mm_andnot_ps(dummy_mask,velec);
985             velecsum         = _mm_add_ps(velecsum,velec);
986
987             fscal            = felec;
988
989             fscal            = _mm_andnot_ps(dummy_mask,fscal);
990
991              /* Update vectorial force */
992             fix2             = _mm_macc_ps(dx21,fscal,fix2);
993             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
994             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
995
996             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
997             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
998             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
999
1000             /**************************
1001              * CALCULATE INTERACTIONS *
1002              **************************/
1003
1004             r22              = _mm_mul_ps(rsq22,rinv22);
1005             r22              = _mm_andnot_ps(dummy_mask,r22);
1006
1007             /* EWALD ELECTROSTATICS */
1008
1009             /* Analytical PME correction */
1010             zeta2            = _mm_mul_ps(beta2,rsq22);
1011             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1012             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1013             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1014             felec            = _mm_mul_ps(qq22,felec);
1015             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1016             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1017             velec            = _mm_mul_ps(qq22,velec);
1018
1019             /* Update potential sum for this i atom from the interaction with this j atom. */
1020             velec            = _mm_andnot_ps(dummy_mask,velec);
1021             velecsum         = _mm_add_ps(velecsum,velec);
1022
1023             fscal            = felec;
1024
1025             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1026
1027              /* Update vectorial force */
1028             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1029             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1030             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1031
1032             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1033             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1034             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1035
1036             /**************************
1037              * CALCULATE INTERACTIONS *
1038              **************************/
1039
1040             r23              = _mm_mul_ps(rsq23,rinv23);
1041             r23              = _mm_andnot_ps(dummy_mask,r23);
1042
1043             /* EWALD ELECTROSTATICS */
1044
1045             /* Analytical PME correction */
1046             zeta2            = _mm_mul_ps(beta2,rsq23);
1047             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1048             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1049             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1050             felec            = _mm_mul_ps(qq23,felec);
1051             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1052             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1053             velec            = _mm_mul_ps(qq23,velec);
1054
1055             /* Update potential sum for this i atom from the interaction with this j atom. */
1056             velec            = _mm_andnot_ps(dummy_mask,velec);
1057             velecsum         = _mm_add_ps(velecsum,velec);
1058
1059             fscal            = felec;
1060
1061             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1062
1063              /* Update vectorial force */
1064             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1065             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1066             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1067
1068             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1069             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1070             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1071
1072             /**************************
1073              * CALCULATE INTERACTIONS *
1074              **************************/
1075
1076             r31              = _mm_mul_ps(rsq31,rinv31);
1077             r31              = _mm_andnot_ps(dummy_mask,r31);
1078
1079             /* EWALD ELECTROSTATICS */
1080
1081             /* Analytical PME correction */
1082             zeta2            = _mm_mul_ps(beta2,rsq31);
1083             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1084             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1085             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1086             felec            = _mm_mul_ps(qq31,felec);
1087             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1088             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1089             velec            = _mm_mul_ps(qq31,velec);
1090
1091             /* Update potential sum for this i atom from the interaction with this j atom. */
1092             velec            = _mm_andnot_ps(dummy_mask,velec);
1093             velecsum         = _mm_add_ps(velecsum,velec);
1094
1095             fscal            = felec;
1096
1097             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1098
1099              /* Update vectorial force */
1100             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1101             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1102             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1103
1104             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1105             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1106             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1107
1108             /**************************
1109              * CALCULATE INTERACTIONS *
1110              **************************/
1111
1112             r32              = _mm_mul_ps(rsq32,rinv32);
1113             r32              = _mm_andnot_ps(dummy_mask,r32);
1114
1115             /* EWALD ELECTROSTATICS */
1116
1117             /* Analytical PME correction */
1118             zeta2            = _mm_mul_ps(beta2,rsq32);
1119             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1120             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1121             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1122             felec            = _mm_mul_ps(qq32,felec);
1123             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1124             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1125             velec            = _mm_mul_ps(qq32,velec);
1126
1127             /* Update potential sum for this i atom from the interaction with this j atom. */
1128             velec            = _mm_andnot_ps(dummy_mask,velec);
1129             velecsum         = _mm_add_ps(velecsum,velec);
1130
1131             fscal            = felec;
1132
1133             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1134
1135              /* Update vectorial force */
1136             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1137             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1138             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1139
1140             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1141             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1142             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1143
1144             /**************************
1145              * CALCULATE INTERACTIONS *
1146              **************************/
1147
1148             r33              = _mm_mul_ps(rsq33,rinv33);
1149             r33              = _mm_andnot_ps(dummy_mask,r33);
1150
1151             /* EWALD ELECTROSTATICS */
1152
1153             /* Analytical PME correction */
1154             zeta2            = _mm_mul_ps(beta2,rsq33);
1155             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1156             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1157             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1158             felec            = _mm_mul_ps(qq33,felec);
1159             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1160             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1161             velec            = _mm_mul_ps(qq33,velec);
1162
1163             /* Update potential sum for this i atom from the interaction with this j atom. */
1164             velec            = _mm_andnot_ps(dummy_mask,velec);
1165             velecsum         = _mm_add_ps(velecsum,velec);
1166
1167             fscal            = felec;
1168
1169             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1170
1171              /* Update vectorial force */
1172             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1173             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1174             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1175
1176             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1177             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1178             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1179
1180             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1181             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1182             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1183             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1184
1185             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1186                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1187                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1188
1189             /* Inner loop uses 333 flops */
1190         }
1191
1192         /* End of innermost loop */
1193
1194         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1195                                               f+i_coord_offset,fshift+i_shift_offset);
1196
1197         ggid                        = gid[iidx];
1198         /* Update potential energies */
1199         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1200         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1201
1202         /* Increment number of inner iterations */
1203         inneriter                  += j_index_end - j_index_start;
1204
1205         /* Outer loop uses 26 flops */
1206     }
1207
1208     /* Increment number of outer iterations */
1209     outeriter        += nri;
1210
1211     /* Update outer/inner flops */
1212
1213     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*333);
1214 }
1215 /*
1216  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1217  * Electrostatics interaction: Ewald
1218  * VdW interaction:            CubicSplineTable
1219  * Geometry:                   Water4-Water4
1220  * Calculate force/pot:        Force
1221  */
1222 void
1223 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1224                     (t_nblist                    * gmx_restrict       nlist,
1225                      rvec                        * gmx_restrict          xx,
1226                      rvec                        * gmx_restrict          ff,
1227                      t_forcerec                  * gmx_restrict          fr,
1228                      t_mdatoms                   * gmx_restrict     mdatoms,
1229                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1230                      t_nrnb                      * gmx_restrict        nrnb)
1231 {
1232     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1233      * just 0 for non-waters.
1234      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1235      * jnr indices corresponding to data put in the four positions in the SIMD register.
1236      */
1237     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1238     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1239     int              jnrA,jnrB,jnrC,jnrD;
1240     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1241     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1242     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1243     real             rcutoff_scalar;
1244     real             *shiftvec,*fshift,*x,*f;
1245     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1246     real             scratch[4*DIM];
1247     __m128           fscal,rcutoff,rcutoff2,jidxall;
1248     int              vdwioffset0;
1249     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1250     int              vdwioffset1;
1251     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1252     int              vdwioffset2;
1253     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1254     int              vdwioffset3;
1255     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1256     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1257     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1258     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1259     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1260     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1261     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1262     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1263     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1264     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1265     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1266     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1267     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1268     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1269     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1270     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1271     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1272     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1273     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1274     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1275     real             *charge;
1276     int              nvdwtype;
1277     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1278     int              *vdwtype;
1279     real             *vdwparam;
1280     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1281     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1282     __m128i          vfitab;
1283     __m128i          ifour       = _mm_set1_epi32(4);
1284     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1285     real             *vftab;
1286     __m128i          ewitab;
1287     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1288     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1289     real             *ewtab;
1290     __m128           dummy_mask,cutoff_mask;
1291     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1292     __m128           one     = _mm_set1_ps(1.0);
1293     __m128           two     = _mm_set1_ps(2.0);
1294     x                = xx[0];
1295     f                = ff[0];
1296
1297     nri              = nlist->nri;
1298     iinr             = nlist->iinr;
1299     jindex           = nlist->jindex;
1300     jjnr             = nlist->jjnr;
1301     shiftidx         = nlist->shift;
1302     gid              = nlist->gid;
1303     shiftvec         = fr->shift_vec[0];
1304     fshift           = fr->fshift[0];
1305     facel            = _mm_set1_ps(fr->epsfac);
1306     charge           = mdatoms->chargeA;
1307     nvdwtype         = fr->ntype;
1308     vdwparam         = fr->nbfp;
1309     vdwtype          = mdatoms->typeA;
1310
1311     vftab            = kernel_data->table_vdw->data;
1312     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1313
1314     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1315     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1316     beta2            = _mm_mul_ps(beta,beta);
1317     beta3            = _mm_mul_ps(beta,beta2);
1318     ewtab            = fr->ic->tabq_coul_F;
1319     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1320     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1321
1322     /* Setup water-specific parameters */
1323     inr              = nlist->iinr[0];
1324     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1325     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1326     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1327     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1328
1329     jq1              = _mm_set1_ps(charge[inr+1]);
1330     jq2              = _mm_set1_ps(charge[inr+2]);
1331     jq3              = _mm_set1_ps(charge[inr+3]);
1332     vdwjidx0A        = 2*vdwtype[inr+0];
1333     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1334     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1335     qq11             = _mm_mul_ps(iq1,jq1);
1336     qq12             = _mm_mul_ps(iq1,jq2);
1337     qq13             = _mm_mul_ps(iq1,jq3);
1338     qq21             = _mm_mul_ps(iq2,jq1);
1339     qq22             = _mm_mul_ps(iq2,jq2);
1340     qq23             = _mm_mul_ps(iq2,jq3);
1341     qq31             = _mm_mul_ps(iq3,jq1);
1342     qq32             = _mm_mul_ps(iq3,jq2);
1343     qq33             = _mm_mul_ps(iq3,jq3);
1344
1345     /* Avoid stupid compiler warnings */
1346     jnrA = jnrB = jnrC = jnrD = 0;
1347     j_coord_offsetA = 0;
1348     j_coord_offsetB = 0;
1349     j_coord_offsetC = 0;
1350     j_coord_offsetD = 0;
1351
1352     outeriter        = 0;
1353     inneriter        = 0;
1354
1355     for(iidx=0;iidx<4*DIM;iidx++)
1356     {
1357         scratch[iidx] = 0.0;
1358     }
1359
1360     /* Start outer loop over neighborlists */
1361     for(iidx=0; iidx<nri; iidx++)
1362     {
1363         /* Load shift vector for this list */
1364         i_shift_offset   = DIM*shiftidx[iidx];
1365
1366         /* Load limits for loop over neighbors */
1367         j_index_start    = jindex[iidx];
1368         j_index_end      = jindex[iidx+1];
1369
1370         /* Get outer coordinate index */
1371         inr              = iinr[iidx];
1372         i_coord_offset   = DIM*inr;
1373
1374         /* Load i particle coords and add shift vector */
1375         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1376                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1377
1378         fix0             = _mm_setzero_ps();
1379         fiy0             = _mm_setzero_ps();
1380         fiz0             = _mm_setzero_ps();
1381         fix1             = _mm_setzero_ps();
1382         fiy1             = _mm_setzero_ps();
1383         fiz1             = _mm_setzero_ps();
1384         fix2             = _mm_setzero_ps();
1385         fiy2             = _mm_setzero_ps();
1386         fiz2             = _mm_setzero_ps();
1387         fix3             = _mm_setzero_ps();
1388         fiy3             = _mm_setzero_ps();
1389         fiz3             = _mm_setzero_ps();
1390
1391         /* Start inner kernel loop */
1392         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1393         {
1394
1395             /* Get j neighbor index, and coordinate index */
1396             jnrA             = jjnr[jidx];
1397             jnrB             = jjnr[jidx+1];
1398             jnrC             = jjnr[jidx+2];
1399             jnrD             = jjnr[jidx+3];
1400             j_coord_offsetA  = DIM*jnrA;
1401             j_coord_offsetB  = DIM*jnrB;
1402             j_coord_offsetC  = DIM*jnrC;
1403             j_coord_offsetD  = DIM*jnrD;
1404
1405             /* load j atom coordinates */
1406             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1407                                               x+j_coord_offsetC,x+j_coord_offsetD,
1408                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1409                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1410
1411             /* Calculate displacement vector */
1412             dx00             = _mm_sub_ps(ix0,jx0);
1413             dy00             = _mm_sub_ps(iy0,jy0);
1414             dz00             = _mm_sub_ps(iz0,jz0);
1415             dx11             = _mm_sub_ps(ix1,jx1);
1416             dy11             = _mm_sub_ps(iy1,jy1);
1417             dz11             = _mm_sub_ps(iz1,jz1);
1418             dx12             = _mm_sub_ps(ix1,jx2);
1419             dy12             = _mm_sub_ps(iy1,jy2);
1420             dz12             = _mm_sub_ps(iz1,jz2);
1421             dx13             = _mm_sub_ps(ix1,jx3);
1422             dy13             = _mm_sub_ps(iy1,jy3);
1423             dz13             = _mm_sub_ps(iz1,jz3);
1424             dx21             = _mm_sub_ps(ix2,jx1);
1425             dy21             = _mm_sub_ps(iy2,jy1);
1426             dz21             = _mm_sub_ps(iz2,jz1);
1427             dx22             = _mm_sub_ps(ix2,jx2);
1428             dy22             = _mm_sub_ps(iy2,jy2);
1429             dz22             = _mm_sub_ps(iz2,jz2);
1430             dx23             = _mm_sub_ps(ix2,jx3);
1431             dy23             = _mm_sub_ps(iy2,jy3);
1432             dz23             = _mm_sub_ps(iz2,jz3);
1433             dx31             = _mm_sub_ps(ix3,jx1);
1434             dy31             = _mm_sub_ps(iy3,jy1);
1435             dz31             = _mm_sub_ps(iz3,jz1);
1436             dx32             = _mm_sub_ps(ix3,jx2);
1437             dy32             = _mm_sub_ps(iy3,jy2);
1438             dz32             = _mm_sub_ps(iz3,jz2);
1439             dx33             = _mm_sub_ps(ix3,jx3);
1440             dy33             = _mm_sub_ps(iy3,jy3);
1441             dz33             = _mm_sub_ps(iz3,jz3);
1442
1443             /* Calculate squared distance and things based on it */
1444             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1445             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1446             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1447             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1448             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1449             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1450             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1451             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1452             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1453             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1454
1455             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1456             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1457             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1458             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1459             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1460             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1461             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1462             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1463             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1464             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1465
1466             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1467             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1468             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1469             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1470             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1471             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1472             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1473             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1474             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1475
1476             fjx0             = _mm_setzero_ps();
1477             fjy0             = _mm_setzero_ps();
1478             fjz0             = _mm_setzero_ps();
1479             fjx1             = _mm_setzero_ps();
1480             fjy1             = _mm_setzero_ps();
1481             fjz1             = _mm_setzero_ps();
1482             fjx2             = _mm_setzero_ps();
1483             fjy2             = _mm_setzero_ps();
1484             fjz2             = _mm_setzero_ps();
1485             fjx3             = _mm_setzero_ps();
1486             fjy3             = _mm_setzero_ps();
1487             fjz3             = _mm_setzero_ps();
1488
1489             /**************************
1490              * CALCULATE INTERACTIONS *
1491              **************************/
1492
1493             r00              = _mm_mul_ps(rsq00,rinv00);
1494
1495             /* Calculate table index by multiplying r with table scale and truncate to integer */
1496             rt               = _mm_mul_ps(r00,vftabscale);
1497             vfitab           = _mm_cvttps_epi32(rt);
1498 #ifdef __XOP__
1499             vfeps            = _mm_frcz_ps(rt);
1500 #else
1501             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1502 #endif
1503             twovfeps         = _mm_add_ps(vfeps,vfeps);
1504             vfitab           = _mm_slli_epi32(vfitab,3);
1505
1506             /* CUBIC SPLINE TABLE DISPERSION */
1507             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1508             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1509             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1510             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1511             _MM_TRANSPOSE4_PS(Y,F,G,H);
1512             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1513             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1514             fvdw6            = _mm_mul_ps(c6_00,FF);
1515
1516             /* CUBIC SPLINE TABLE REPULSION */
1517             vfitab           = _mm_add_epi32(vfitab,ifour);
1518             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1519             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1520             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1521             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1522             _MM_TRANSPOSE4_PS(Y,F,G,H);
1523             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1524             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1525             fvdw12           = _mm_mul_ps(c12_00,FF);
1526             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1527
1528             fscal            = fvdw;
1529
1530              /* Update vectorial force */
1531             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1532             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1533             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1534
1535             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1536             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1537             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1538
1539             /**************************
1540              * CALCULATE INTERACTIONS *
1541              **************************/
1542
1543             r11              = _mm_mul_ps(rsq11,rinv11);
1544
1545             /* EWALD ELECTROSTATICS */
1546
1547             /* Analytical PME correction */
1548             zeta2            = _mm_mul_ps(beta2,rsq11);
1549             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1550             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1551             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1552             felec            = _mm_mul_ps(qq11,felec);
1553
1554             fscal            = felec;
1555
1556              /* Update vectorial force */
1557             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1558             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1559             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1560
1561             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1562             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1563             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1564
1565             /**************************
1566              * CALCULATE INTERACTIONS *
1567              **************************/
1568
1569             r12              = _mm_mul_ps(rsq12,rinv12);
1570
1571             /* EWALD ELECTROSTATICS */
1572
1573             /* Analytical PME correction */
1574             zeta2            = _mm_mul_ps(beta2,rsq12);
1575             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1576             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1577             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1578             felec            = _mm_mul_ps(qq12,felec);
1579
1580             fscal            = felec;
1581
1582              /* Update vectorial force */
1583             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1584             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1585             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1586
1587             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1588             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1589             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1590
1591             /**************************
1592              * CALCULATE INTERACTIONS *
1593              **************************/
1594
1595             r13              = _mm_mul_ps(rsq13,rinv13);
1596
1597             /* EWALD ELECTROSTATICS */
1598
1599             /* Analytical PME correction */
1600             zeta2            = _mm_mul_ps(beta2,rsq13);
1601             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1602             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1603             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1604             felec            = _mm_mul_ps(qq13,felec);
1605
1606             fscal            = felec;
1607
1608              /* Update vectorial force */
1609             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1610             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1611             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1612
1613             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1614             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1615             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1616
1617             /**************************
1618              * CALCULATE INTERACTIONS *
1619              **************************/
1620
1621             r21              = _mm_mul_ps(rsq21,rinv21);
1622
1623             /* EWALD ELECTROSTATICS */
1624
1625             /* Analytical PME correction */
1626             zeta2            = _mm_mul_ps(beta2,rsq21);
1627             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1628             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1629             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1630             felec            = _mm_mul_ps(qq21,felec);
1631
1632             fscal            = felec;
1633
1634              /* Update vectorial force */
1635             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1636             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1637             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1638
1639             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1640             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1641             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1642
1643             /**************************
1644              * CALCULATE INTERACTIONS *
1645              **************************/
1646
1647             r22              = _mm_mul_ps(rsq22,rinv22);
1648
1649             /* EWALD ELECTROSTATICS */
1650
1651             /* Analytical PME correction */
1652             zeta2            = _mm_mul_ps(beta2,rsq22);
1653             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1654             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1655             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1656             felec            = _mm_mul_ps(qq22,felec);
1657
1658             fscal            = felec;
1659
1660              /* Update vectorial force */
1661             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1662             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1663             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1664
1665             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1666             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1667             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1668
1669             /**************************
1670              * CALCULATE INTERACTIONS *
1671              **************************/
1672
1673             r23              = _mm_mul_ps(rsq23,rinv23);
1674
1675             /* EWALD ELECTROSTATICS */
1676
1677             /* Analytical PME correction */
1678             zeta2            = _mm_mul_ps(beta2,rsq23);
1679             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1680             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1681             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1682             felec            = _mm_mul_ps(qq23,felec);
1683
1684             fscal            = felec;
1685
1686              /* Update vectorial force */
1687             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1688             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1689             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1690
1691             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1692             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1693             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1694
1695             /**************************
1696              * CALCULATE INTERACTIONS *
1697              **************************/
1698
1699             r31              = _mm_mul_ps(rsq31,rinv31);
1700
1701             /* EWALD ELECTROSTATICS */
1702
1703             /* Analytical PME correction */
1704             zeta2            = _mm_mul_ps(beta2,rsq31);
1705             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1706             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1707             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1708             felec            = _mm_mul_ps(qq31,felec);
1709
1710             fscal            = felec;
1711
1712              /* Update vectorial force */
1713             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1714             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1715             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1716
1717             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1718             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1719             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1720
1721             /**************************
1722              * CALCULATE INTERACTIONS *
1723              **************************/
1724
1725             r32              = _mm_mul_ps(rsq32,rinv32);
1726
1727             /* EWALD ELECTROSTATICS */
1728
1729             /* Analytical PME correction */
1730             zeta2            = _mm_mul_ps(beta2,rsq32);
1731             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1732             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1733             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1734             felec            = _mm_mul_ps(qq32,felec);
1735
1736             fscal            = felec;
1737
1738              /* Update vectorial force */
1739             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1740             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1741             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1742
1743             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1744             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1745             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1746
1747             /**************************
1748              * CALCULATE INTERACTIONS *
1749              **************************/
1750
1751             r33              = _mm_mul_ps(rsq33,rinv33);
1752
1753             /* EWALD ELECTROSTATICS */
1754
1755             /* Analytical PME correction */
1756             zeta2            = _mm_mul_ps(beta2,rsq33);
1757             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1758             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1759             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1760             felec            = _mm_mul_ps(qq33,felec);
1761
1762             fscal            = felec;
1763
1764              /* Update vectorial force */
1765             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1766             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1767             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1768
1769             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1770             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1771             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1772
1773             fjptrA             = f+j_coord_offsetA;
1774             fjptrB             = f+j_coord_offsetB;
1775             fjptrC             = f+j_coord_offsetC;
1776             fjptrD             = f+j_coord_offsetD;
1777
1778             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1779                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1780                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1781
1782             /* Inner loop uses 306 flops */
1783         }
1784
1785         if(jidx<j_index_end)
1786         {
1787
1788             /* Get j neighbor index, and coordinate index */
1789             jnrlistA         = jjnr[jidx];
1790             jnrlistB         = jjnr[jidx+1];
1791             jnrlistC         = jjnr[jidx+2];
1792             jnrlistD         = jjnr[jidx+3];
1793             /* Sign of each element will be negative for non-real atoms.
1794              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1795              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1796              */
1797             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1798             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1799             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1800             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1801             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1802             j_coord_offsetA  = DIM*jnrA;
1803             j_coord_offsetB  = DIM*jnrB;
1804             j_coord_offsetC  = DIM*jnrC;
1805             j_coord_offsetD  = DIM*jnrD;
1806
1807             /* load j atom coordinates */
1808             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1809                                               x+j_coord_offsetC,x+j_coord_offsetD,
1810                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1811                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1812
1813             /* Calculate displacement vector */
1814             dx00             = _mm_sub_ps(ix0,jx0);
1815             dy00             = _mm_sub_ps(iy0,jy0);
1816             dz00             = _mm_sub_ps(iz0,jz0);
1817             dx11             = _mm_sub_ps(ix1,jx1);
1818             dy11             = _mm_sub_ps(iy1,jy1);
1819             dz11             = _mm_sub_ps(iz1,jz1);
1820             dx12             = _mm_sub_ps(ix1,jx2);
1821             dy12             = _mm_sub_ps(iy1,jy2);
1822             dz12             = _mm_sub_ps(iz1,jz2);
1823             dx13             = _mm_sub_ps(ix1,jx3);
1824             dy13             = _mm_sub_ps(iy1,jy3);
1825             dz13             = _mm_sub_ps(iz1,jz3);
1826             dx21             = _mm_sub_ps(ix2,jx1);
1827             dy21             = _mm_sub_ps(iy2,jy1);
1828             dz21             = _mm_sub_ps(iz2,jz1);
1829             dx22             = _mm_sub_ps(ix2,jx2);
1830             dy22             = _mm_sub_ps(iy2,jy2);
1831             dz22             = _mm_sub_ps(iz2,jz2);
1832             dx23             = _mm_sub_ps(ix2,jx3);
1833             dy23             = _mm_sub_ps(iy2,jy3);
1834             dz23             = _mm_sub_ps(iz2,jz3);
1835             dx31             = _mm_sub_ps(ix3,jx1);
1836             dy31             = _mm_sub_ps(iy3,jy1);
1837             dz31             = _mm_sub_ps(iz3,jz1);
1838             dx32             = _mm_sub_ps(ix3,jx2);
1839             dy32             = _mm_sub_ps(iy3,jy2);
1840             dz32             = _mm_sub_ps(iz3,jz2);
1841             dx33             = _mm_sub_ps(ix3,jx3);
1842             dy33             = _mm_sub_ps(iy3,jy3);
1843             dz33             = _mm_sub_ps(iz3,jz3);
1844
1845             /* Calculate squared distance and things based on it */
1846             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1847             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1848             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1849             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1850             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1851             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1852             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1853             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1854             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1855             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1856
1857             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1858             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1859             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1860             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1861             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1862             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1863             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1864             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1865             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1866             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1867
1868             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1869             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1870             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1871             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1872             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1873             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1874             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1875             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1876             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1877
1878             fjx0             = _mm_setzero_ps();
1879             fjy0             = _mm_setzero_ps();
1880             fjz0             = _mm_setzero_ps();
1881             fjx1             = _mm_setzero_ps();
1882             fjy1             = _mm_setzero_ps();
1883             fjz1             = _mm_setzero_ps();
1884             fjx2             = _mm_setzero_ps();
1885             fjy2             = _mm_setzero_ps();
1886             fjz2             = _mm_setzero_ps();
1887             fjx3             = _mm_setzero_ps();
1888             fjy3             = _mm_setzero_ps();
1889             fjz3             = _mm_setzero_ps();
1890
1891             /**************************
1892              * CALCULATE INTERACTIONS *
1893              **************************/
1894
1895             r00              = _mm_mul_ps(rsq00,rinv00);
1896             r00              = _mm_andnot_ps(dummy_mask,r00);
1897
1898             /* Calculate table index by multiplying r with table scale and truncate to integer */
1899             rt               = _mm_mul_ps(r00,vftabscale);
1900             vfitab           = _mm_cvttps_epi32(rt);
1901 #ifdef __XOP__
1902             vfeps            = _mm_frcz_ps(rt);
1903 #else
1904             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1905 #endif
1906             twovfeps         = _mm_add_ps(vfeps,vfeps);
1907             vfitab           = _mm_slli_epi32(vfitab,3);
1908
1909             /* CUBIC SPLINE TABLE DISPERSION */
1910             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1911             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1912             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1913             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1914             _MM_TRANSPOSE4_PS(Y,F,G,H);
1915             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1916             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1917             fvdw6            = _mm_mul_ps(c6_00,FF);
1918
1919             /* CUBIC SPLINE TABLE REPULSION */
1920             vfitab           = _mm_add_epi32(vfitab,ifour);
1921             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1922             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1923             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1924             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1925             _MM_TRANSPOSE4_PS(Y,F,G,H);
1926             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1927             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1928             fvdw12           = _mm_mul_ps(c12_00,FF);
1929             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1930
1931             fscal            = fvdw;
1932
1933             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1934
1935              /* Update vectorial force */
1936             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1937             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1938             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1939
1940             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1941             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1942             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1943
1944             /**************************
1945              * CALCULATE INTERACTIONS *
1946              **************************/
1947
1948             r11              = _mm_mul_ps(rsq11,rinv11);
1949             r11              = _mm_andnot_ps(dummy_mask,r11);
1950
1951             /* EWALD ELECTROSTATICS */
1952
1953             /* Analytical PME correction */
1954             zeta2            = _mm_mul_ps(beta2,rsq11);
1955             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1956             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1957             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1958             felec            = _mm_mul_ps(qq11,felec);
1959
1960             fscal            = felec;
1961
1962             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1963
1964              /* Update vectorial force */
1965             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1966             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1967             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1968
1969             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1970             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1971             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1972
1973             /**************************
1974              * CALCULATE INTERACTIONS *
1975              **************************/
1976
1977             r12              = _mm_mul_ps(rsq12,rinv12);
1978             r12              = _mm_andnot_ps(dummy_mask,r12);
1979
1980             /* EWALD ELECTROSTATICS */
1981
1982             /* Analytical PME correction */
1983             zeta2            = _mm_mul_ps(beta2,rsq12);
1984             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1985             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1986             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1987             felec            = _mm_mul_ps(qq12,felec);
1988
1989             fscal            = felec;
1990
1991             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1992
1993              /* Update vectorial force */
1994             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1995             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1996             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1997
1998             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1999             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2000             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2001
2002             /**************************
2003              * CALCULATE INTERACTIONS *
2004              **************************/
2005
2006             r13              = _mm_mul_ps(rsq13,rinv13);
2007             r13              = _mm_andnot_ps(dummy_mask,r13);
2008
2009             /* EWALD ELECTROSTATICS */
2010
2011             /* Analytical PME correction */
2012             zeta2            = _mm_mul_ps(beta2,rsq13);
2013             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
2014             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2015             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2016             felec            = _mm_mul_ps(qq13,felec);
2017
2018             fscal            = felec;
2019
2020             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2021
2022              /* Update vectorial force */
2023             fix1             = _mm_macc_ps(dx13,fscal,fix1);
2024             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
2025             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
2026
2027             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
2028             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
2029             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
2030
2031             /**************************
2032              * CALCULATE INTERACTIONS *
2033              **************************/
2034
2035             r21              = _mm_mul_ps(rsq21,rinv21);
2036             r21              = _mm_andnot_ps(dummy_mask,r21);
2037
2038             /* EWALD ELECTROSTATICS */
2039
2040             /* Analytical PME correction */
2041             zeta2            = _mm_mul_ps(beta2,rsq21);
2042             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2043             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2044             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2045             felec            = _mm_mul_ps(qq21,felec);
2046
2047             fscal            = felec;
2048
2049             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2050
2051              /* Update vectorial force */
2052             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2053             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2054             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2055
2056             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2057             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2058             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2059
2060             /**************************
2061              * CALCULATE INTERACTIONS *
2062              **************************/
2063
2064             r22              = _mm_mul_ps(rsq22,rinv22);
2065             r22              = _mm_andnot_ps(dummy_mask,r22);
2066
2067             /* EWALD ELECTROSTATICS */
2068
2069             /* Analytical PME correction */
2070             zeta2            = _mm_mul_ps(beta2,rsq22);
2071             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2072             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2073             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2074             felec            = _mm_mul_ps(qq22,felec);
2075
2076             fscal            = felec;
2077
2078             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2079
2080              /* Update vectorial force */
2081             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2082             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2083             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2084
2085             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2086             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2087             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2088
2089             /**************************
2090              * CALCULATE INTERACTIONS *
2091              **************************/
2092
2093             r23              = _mm_mul_ps(rsq23,rinv23);
2094             r23              = _mm_andnot_ps(dummy_mask,r23);
2095
2096             /* EWALD ELECTROSTATICS */
2097
2098             /* Analytical PME correction */
2099             zeta2            = _mm_mul_ps(beta2,rsq23);
2100             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
2101             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2102             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2103             felec            = _mm_mul_ps(qq23,felec);
2104
2105             fscal            = felec;
2106
2107             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2108
2109              /* Update vectorial force */
2110             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2111             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2112             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2113
2114             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2115             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2116             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2117
2118             /**************************
2119              * CALCULATE INTERACTIONS *
2120              **************************/
2121
2122             r31              = _mm_mul_ps(rsq31,rinv31);
2123             r31              = _mm_andnot_ps(dummy_mask,r31);
2124
2125             /* EWALD ELECTROSTATICS */
2126
2127             /* Analytical PME correction */
2128             zeta2            = _mm_mul_ps(beta2,rsq31);
2129             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
2130             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2131             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2132             felec            = _mm_mul_ps(qq31,felec);
2133
2134             fscal            = felec;
2135
2136             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2137
2138              /* Update vectorial force */
2139             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2140             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2141             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2142
2143             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2144             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2145             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2146
2147             /**************************
2148              * CALCULATE INTERACTIONS *
2149              **************************/
2150
2151             r32              = _mm_mul_ps(rsq32,rinv32);
2152             r32              = _mm_andnot_ps(dummy_mask,r32);
2153
2154             /* EWALD ELECTROSTATICS */
2155
2156             /* Analytical PME correction */
2157             zeta2            = _mm_mul_ps(beta2,rsq32);
2158             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
2159             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2160             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2161             felec            = _mm_mul_ps(qq32,felec);
2162
2163             fscal            = felec;
2164
2165             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2166
2167              /* Update vectorial force */
2168             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2169             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2170             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2171
2172             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2173             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2174             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2175
2176             /**************************
2177              * CALCULATE INTERACTIONS *
2178              **************************/
2179
2180             r33              = _mm_mul_ps(rsq33,rinv33);
2181             r33              = _mm_andnot_ps(dummy_mask,r33);
2182
2183             /* EWALD ELECTROSTATICS */
2184
2185             /* Analytical PME correction */
2186             zeta2            = _mm_mul_ps(beta2,rsq33);
2187             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
2188             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2189             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2190             felec            = _mm_mul_ps(qq33,felec);
2191
2192             fscal            = felec;
2193
2194             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2195
2196              /* Update vectorial force */
2197             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2198             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2199             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2200
2201             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2202             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2203             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2204
2205             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2206             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2207             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2208             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2209
2210             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2211                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2212                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2213
2214             /* Inner loop uses 316 flops */
2215         }
2216
2217         /* End of innermost loop */
2218
2219         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2220                                               f+i_coord_offset,fshift+i_shift_offset);
2221
2222         /* Increment number of inner iterations */
2223         inneriter                  += j_index_end - j_index_start;
2224
2225         /* Outer loop uses 24 flops */
2226     }
2227
2228     /* Increment number of outer iterations */
2229     outeriter        += nri;
2230
2231     /* Update outer/inner flops */
2232
2233     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*316);
2234 }