28453eadd878c800cf997b8b5e440d964abb8eea
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwCSTab_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_VF_avx_128_fma_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
116     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
117     __m128i          vfitab;
118     __m128i          ifour       = _mm_set1_epi32(4);
119     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
120     real             *vftab;
121     __m128i          ewitab;
122     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
123     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
124     real             *ewtab;
125     __m128           dummy_mask,cutoff_mask;
126     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
127     __m128           one     = _mm_set1_ps(1.0);
128     __m128           two     = _mm_set1_ps(2.0);
129     x                = xx[0];
130     f                = ff[0];
131
132     nri              = nlist->nri;
133     iinr             = nlist->iinr;
134     jindex           = nlist->jindex;
135     jjnr             = nlist->jjnr;
136     shiftidx         = nlist->shift;
137     gid              = nlist->gid;
138     shiftvec         = fr->shift_vec[0];
139     fshift           = fr->fshift[0];
140     facel            = _mm_set1_ps(fr->epsfac);
141     charge           = mdatoms->chargeA;
142     nvdwtype         = fr->ntype;
143     vdwparam         = fr->nbfp;
144     vdwtype          = mdatoms->typeA;
145
146     vftab            = kernel_data->table_vdw->data;
147     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
148
149     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
150     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
151     beta2            = _mm_mul_ps(beta,beta);
152     beta3            = _mm_mul_ps(beta,beta2);
153     ewtab            = fr->ic->tabq_coul_FDV0;
154     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
155     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
156
157     /* Setup water-specific parameters */
158     inr              = nlist->iinr[0];
159     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
160     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
161     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
162     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
163
164     jq1              = _mm_set1_ps(charge[inr+1]);
165     jq2              = _mm_set1_ps(charge[inr+2]);
166     jq3              = _mm_set1_ps(charge[inr+3]);
167     vdwjidx0A        = 2*vdwtype[inr+0];
168     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
169     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
170     qq11             = _mm_mul_ps(iq1,jq1);
171     qq12             = _mm_mul_ps(iq1,jq2);
172     qq13             = _mm_mul_ps(iq1,jq3);
173     qq21             = _mm_mul_ps(iq2,jq1);
174     qq22             = _mm_mul_ps(iq2,jq2);
175     qq23             = _mm_mul_ps(iq2,jq3);
176     qq31             = _mm_mul_ps(iq3,jq1);
177     qq32             = _mm_mul_ps(iq3,jq2);
178     qq33             = _mm_mul_ps(iq3,jq3);
179
180     /* Avoid stupid compiler warnings */
181     jnrA = jnrB = jnrC = jnrD = 0;
182     j_coord_offsetA = 0;
183     j_coord_offsetB = 0;
184     j_coord_offsetC = 0;
185     j_coord_offsetD = 0;
186
187     outeriter        = 0;
188     inneriter        = 0;
189
190     for(iidx=0;iidx<4*DIM;iidx++)
191     {
192         scratch[iidx] = 0.0;
193     }
194
195     /* Start outer loop over neighborlists */
196     for(iidx=0; iidx<nri; iidx++)
197     {
198         /* Load shift vector for this list */
199         i_shift_offset   = DIM*shiftidx[iidx];
200
201         /* Load limits for loop over neighbors */
202         j_index_start    = jindex[iidx];
203         j_index_end      = jindex[iidx+1];
204
205         /* Get outer coordinate index */
206         inr              = iinr[iidx];
207         i_coord_offset   = DIM*inr;
208
209         /* Load i particle coords and add shift vector */
210         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
211                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
212
213         fix0             = _mm_setzero_ps();
214         fiy0             = _mm_setzero_ps();
215         fiz0             = _mm_setzero_ps();
216         fix1             = _mm_setzero_ps();
217         fiy1             = _mm_setzero_ps();
218         fiz1             = _mm_setzero_ps();
219         fix2             = _mm_setzero_ps();
220         fiy2             = _mm_setzero_ps();
221         fiz2             = _mm_setzero_ps();
222         fix3             = _mm_setzero_ps();
223         fiy3             = _mm_setzero_ps();
224         fiz3             = _mm_setzero_ps();
225
226         /* Reset potential sums */
227         velecsum         = _mm_setzero_ps();
228         vvdwsum          = _mm_setzero_ps();
229
230         /* Start inner kernel loop */
231         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
232         {
233
234             /* Get j neighbor index, and coordinate index */
235             jnrA             = jjnr[jidx];
236             jnrB             = jjnr[jidx+1];
237             jnrC             = jjnr[jidx+2];
238             jnrD             = jjnr[jidx+3];
239             j_coord_offsetA  = DIM*jnrA;
240             j_coord_offsetB  = DIM*jnrB;
241             j_coord_offsetC  = DIM*jnrC;
242             j_coord_offsetD  = DIM*jnrD;
243
244             /* load j atom coordinates */
245             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
246                                               x+j_coord_offsetC,x+j_coord_offsetD,
247                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
248                                               &jy2,&jz2,&jx3,&jy3,&jz3);
249
250             /* Calculate displacement vector */
251             dx00             = _mm_sub_ps(ix0,jx0);
252             dy00             = _mm_sub_ps(iy0,jy0);
253             dz00             = _mm_sub_ps(iz0,jz0);
254             dx11             = _mm_sub_ps(ix1,jx1);
255             dy11             = _mm_sub_ps(iy1,jy1);
256             dz11             = _mm_sub_ps(iz1,jz1);
257             dx12             = _mm_sub_ps(ix1,jx2);
258             dy12             = _mm_sub_ps(iy1,jy2);
259             dz12             = _mm_sub_ps(iz1,jz2);
260             dx13             = _mm_sub_ps(ix1,jx3);
261             dy13             = _mm_sub_ps(iy1,jy3);
262             dz13             = _mm_sub_ps(iz1,jz3);
263             dx21             = _mm_sub_ps(ix2,jx1);
264             dy21             = _mm_sub_ps(iy2,jy1);
265             dz21             = _mm_sub_ps(iz2,jz1);
266             dx22             = _mm_sub_ps(ix2,jx2);
267             dy22             = _mm_sub_ps(iy2,jy2);
268             dz22             = _mm_sub_ps(iz2,jz2);
269             dx23             = _mm_sub_ps(ix2,jx3);
270             dy23             = _mm_sub_ps(iy2,jy3);
271             dz23             = _mm_sub_ps(iz2,jz3);
272             dx31             = _mm_sub_ps(ix3,jx1);
273             dy31             = _mm_sub_ps(iy3,jy1);
274             dz31             = _mm_sub_ps(iz3,jz1);
275             dx32             = _mm_sub_ps(ix3,jx2);
276             dy32             = _mm_sub_ps(iy3,jy2);
277             dz32             = _mm_sub_ps(iz3,jz2);
278             dx33             = _mm_sub_ps(ix3,jx3);
279             dy33             = _mm_sub_ps(iy3,jy3);
280             dz33             = _mm_sub_ps(iz3,jz3);
281
282             /* Calculate squared distance and things based on it */
283             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
284             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
285             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
286             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
287             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
288             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
289             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
290             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
291             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
292             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
293
294             rinv00           = gmx_mm_invsqrt_ps(rsq00);
295             rinv11           = gmx_mm_invsqrt_ps(rsq11);
296             rinv12           = gmx_mm_invsqrt_ps(rsq12);
297             rinv13           = gmx_mm_invsqrt_ps(rsq13);
298             rinv21           = gmx_mm_invsqrt_ps(rsq21);
299             rinv22           = gmx_mm_invsqrt_ps(rsq22);
300             rinv23           = gmx_mm_invsqrt_ps(rsq23);
301             rinv31           = gmx_mm_invsqrt_ps(rsq31);
302             rinv32           = gmx_mm_invsqrt_ps(rsq32);
303             rinv33           = gmx_mm_invsqrt_ps(rsq33);
304
305             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
306             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
307             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
308             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
309             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
310             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
311             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
312             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
313             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
314
315             fjx0             = _mm_setzero_ps();
316             fjy0             = _mm_setzero_ps();
317             fjz0             = _mm_setzero_ps();
318             fjx1             = _mm_setzero_ps();
319             fjy1             = _mm_setzero_ps();
320             fjz1             = _mm_setzero_ps();
321             fjx2             = _mm_setzero_ps();
322             fjy2             = _mm_setzero_ps();
323             fjz2             = _mm_setzero_ps();
324             fjx3             = _mm_setzero_ps();
325             fjy3             = _mm_setzero_ps();
326             fjz3             = _mm_setzero_ps();
327
328             /**************************
329              * CALCULATE INTERACTIONS *
330              **************************/
331
332             r00              = _mm_mul_ps(rsq00,rinv00);
333
334             /* Calculate table index by multiplying r with table scale and truncate to integer */
335             rt               = _mm_mul_ps(r00,vftabscale);
336             vfitab           = _mm_cvttps_epi32(rt);
337 #ifdef __XOP__
338             vfeps            = _mm_frcz_ps(rt);
339 #else
340             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
341 #endif
342             twovfeps         = _mm_add_ps(vfeps,vfeps);
343             vfitab           = _mm_slli_epi32(vfitab,3);
344
345             /* CUBIC SPLINE TABLE DISPERSION */
346             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
347             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
348             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
349             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
350             _MM_TRANSPOSE4_PS(Y,F,G,H);
351             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
352             VV               = _mm_macc_ps(vfeps,Fp,Y);
353             vvdw6            = _mm_mul_ps(c6_00,VV);
354             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
355             fvdw6            = _mm_mul_ps(c6_00,FF);
356
357             /* CUBIC SPLINE TABLE REPULSION */
358             vfitab           = _mm_add_epi32(vfitab,ifour);
359             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
360             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
361             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
362             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
363             _MM_TRANSPOSE4_PS(Y,F,G,H);
364             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
365             VV               = _mm_macc_ps(vfeps,Fp,Y);
366             vvdw12           = _mm_mul_ps(c12_00,VV);
367             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
368             fvdw12           = _mm_mul_ps(c12_00,FF);
369             vvdw             = _mm_add_ps(vvdw12,vvdw6);
370             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
371
372             /* Update potential sum for this i atom from the interaction with this j atom. */
373             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
374
375             fscal            = fvdw;
376
377              /* Update vectorial force */
378             fix0             = _mm_macc_ps(dx00,fscal,fix0);
379             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
380             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
381
382             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
383             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
384             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
385
386             /**************************
387              * CALCULATE INTERACTIONS *
388              **************************/
389
390             r11              = _mm_mul_ps(rsq11,rinv11);
391
392             /* EWALD ELECTROSTATICS */
393
394             /* Analytical PME correction */
395             zeta2            = _mm_mul_ps(beta2,rsq11);
396             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
397             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
398             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
399             felec            = _mm_mul_ps(qq11,felec);
400             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
401             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
402             velec            = _mm_mul_ps(qq11,velec);
403
404             /* Update potential sum for this i atom from the interaction with this j atom. */
405             velecsum         = _mm_add_ps(velecsum,velec);
406
407             fscal            = felec;
408
409              /* Update vectorial force */
410             fix1             = _mm_macc_ps(dx11,fscal,fix1);
411             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
412             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
413
414             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
415             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
416             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
417
418             /**************************
419              * CALCULATE INTERACTIONS *
420              **************************/
421
422             r12              = _mm_mul_ps(rsq12,rinv12);
423
424             /* EWALD ELECTROSTATICS */
425
426             /* Analytical PME correction */
427             zeta2            = _mm_mul_ps(beta2,rsq12);
428             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
429             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
430             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
431             felec            = _mm_mul_ps(qq12,felec);
432             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
433             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
434             velec            = _mm_mul_ps(qq12,velec);
435
436             /* Update potential sum for this i atom from the interaction with this j atom. */
437             velecsum         = _mm_add_ps(velecsum,velec);
438
439             fscal            = felec;
440
441              /* Update vectorial force */
442             fix1             = _mm_macc_ps(dx12,fscal,fix1);
443             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
444             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
445
446             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
447             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
448             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
449
450             /**************************
451              * CALCULATE INTERACTIONS *
452              **************************/
453
454             r13              = _mm_mul_ps(rsq13,rinv13);
455
456             /* EWALD ELECTROSTATICS */
457
458             /* Analytical PME correction */
459             zeta2            = _mm_mul_ps(beta2,rsq13);
460             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
461             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
462             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
463             felec            = _mm_mul_ps(qq13,felec);
464             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
465             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
466             velec            = _mm_mul_ps(qq13,velec);
467
468             /* Update potential sum for this i atom from the interaction with this j atom. */
469             velecsum         = _mm_add_ps(velecsum,velec);
470
471             fscal            = felec;
472
473              /* Update vectorial force */
474             fix1             = _mm_macc_ps(dx13,fscal,fix1);
475             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
476             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
477
478             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
479             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
480             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
481
482             /**************************
483              * CALCULATE INTERACTIONS *
484              **************************/
485
486             r21              = _mm_mul_ps(rsq21,rinv21);
487
488             /* EWALD ELECTROSTATICS */
489
490             /* Analytical PME correction */
491             zeta2            = _mm_mul_ps(beta2,rsq21);
492             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
493             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
494             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
495             felec            = _mm_mul_ps(qq21,felec);
496             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
497             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
498             velec            = _mm_mul_ps(qq21,velec);
499
500             /* Update potential sum for this i atom from the interaction with this j atom. */
501             velecsum         = _mm_add_ps(velecsum,velec);
502
503             fscal            = felec;
504
505              /* Update vectorial force */
506             fix2             = _mm_macc_ps(dx21,fscal,fix2);
507             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
508             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
509
510             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
511             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
512             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
513
514             /**************************
515              * CALCULATE INTERACTIONS *
516              **************************/
517
518             r22              = _mm_mul_ps(rsq22,rinv22);
519
520             /* EWALD ELECTROSTATICS */
521
522             /* Analytical PME correction */
523             zeta2            = _mm_mul_ps(beta2,rsq22);
524             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
525             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
526             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
527             felec            = _mm_mul_ps(qq22,felec);
528             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
529             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
530             velec            = _mm_mul_ps(qq22,velec);
531
532             /* Update potential sum for this i atom from the interaction with this j atom. */
533             velecsum         = _mm_add_ps(velecsum,velec);
534
535             fscal            = felec;
536
537              /* Update vectorial force */
538             fix2             = _mm_macc_ps(dx22,fscal,fix2);
539             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
540             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
541
542             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
543             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
544             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
545
546             /**************************
547              * CALCULATE INTERACTIONS *
548              **************************/
549
550             r23              = _mm_mul_ps(rsq23,rinv23);
551
552             /* EWALD ELECTROSTATICS */
553
554             /* Analytical PME correction */
555             zeta2            = _mm_mul_ps(beta2,rsq23);
556             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
557             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
558             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
559             felec            = _mm_mul_ps(qq23,felec);
560             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
561             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
562             velec            = _mm_mul_ps(qq23,velec);
563
564             /* Update potential sum for this i atom from the interaction with this j atom. */
565             velecsum         = _mm_add_ps(velecsum,velec);
566
567             fscal            = felec;
568
569              /* Update vectorial force */
570             fix2             = _mm_macc_ps(dx23,fscal,fix2);
571             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
572             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
573
574             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
575             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
576             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
577
578             /**************************
579              * CALCULATE INTERACTIONS *
580              **************************/
581
582             r31              = _mm_mul_ps(rsq31,rinv31);
583
584             /* EWALD ELECTROSTATICS */
585
586             /* Analytical PME correction */
587             zeta2            = _mm_mul_ps(beta2,rsq31);
588             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
589             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
590             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
591             felec            = _mm_mul_ps(qq31,felec);
592             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
593             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
594             velec            = _mm_mul_ps(qq31,velec);
595
596             /* Update potential sum for this i atom from the interaction with this j atom. */
597             velecsum         = _mm_add_ps(velecsum,velec);
598
599             fscal            = felec;
600
601              /* Update vectorial force */
602             fix3             = _mm_macc_ps(dx31,fscal,fix3);
603             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
604             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
605
606             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
607             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
608             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
609
610             /**************************
611              * CALCULATE INTERACTIONS *
612              **************************/
613
614             r32              = _mm_mul_ps(rsq32,rinv32);
615
616             /* EWALD ELECTROSTATICS */
617
618             /* Analytical PME correction */
619             zeta2            = _mm_mul_ps(beta2,rsq32);
620             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
621             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
622             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
623             felec            = _mm_mul_ps(qq32,felec);
624             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
625             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
626             velec            = _mm_mul_ps(qq32,velec);
627
628             /* Update potential sum for this i atom from the interaction with this j atom. */
629             velecsum         = _mm_add_ps(velecsum,velec);
630
631             fscal            = felec;
632
633              /* Update vectorial force */
634             fix3             = _mm_macc_ps(dx32,fscal,fix3);
635             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
636             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
637
638             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
639             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
640             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
641
642             /**************************
643              * CALCULATE INTERACTIONS *
644              **************************/
645
646             r33              = _mm_mul_ps(rsq33,rinv33);
647
648             /* EWALD ELECTROSTATICS */
649
650             /* Analytical PME correction */
651             zeta2            = _mm_mul_ps(beta2,rsq33);
652             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
653             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
654             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
655             felec            = _mm_mul_ps(qq33,felec);
656             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
657             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
658             velec            = _mm_mul_ps(qq33,velec);
659
660             /* Update potential sum for this i atom from the interaction with this j atom. */
661             velecsum         = _mm_add_ps(velecsum,velec);
662
663             fscal            = felec;
664
665              /* Update vectorial force */
666             fix3             = _mm_macc_ps(dx33,fscal,fix3);
667             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
668             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
669
670             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
671             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
672             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
673
674             fjptrA             = f+j_coord_offsetA;
675             fjptrB             = f+j_coord_offsetB;
676             fjptrC             = f+j_coord_offsetC;
677             fjptrD             = f+j_coord_offsetD;
678
679             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
680                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
681                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
682
683             /* Inner loop uses 323 flops */
684         }
685
686         if(jidx<j_index_end)
687         {
688
689             /* Get j neighbor index, and coordinate index */
690             jnrlistA         = jjnr[jidx];
691             jnrlistB         = jjnr[jidx+1];
692             jnrlistC         = jjnr[jidx+2];
693             jnrlistD         = jjnr[jidx+3];
694             /* Sign of each element will be negative for non-real atoms.
695              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
696              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
697              */
698             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
699             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
700             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
701             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
702             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
703             j_coord_offsetA  = DIM*jnrA;
704             j_coord_offsetB  = DIM*jnrB;
705             j_coord_offsetC  = DIM*jnrC;
706             j_coord_offsetD  = DIM*jnrD;
707
708             /* load j atom coordinates */
709             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
710                                               x+j_coord_offsetC,x+j_coord_offsetD,
711                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
712                                               &jy2,&jz2,&jx3,&jy3,&jz3);
713
714             /* Calculate displacement vector */
715             dx00             = _mm_sub_ps(ix0,jx0);
716             dy00             = _mm_sub_ps(iy0,jy0);
717             dz00             = _mm_sub_ps(iz0,jz0);
718             dx11             = _mm_sub_ps(ix1,jx1);
719             dy11             = _mm_sub_ps(iy1,jy1);
720             dz11             = _mm_sub_ps(iz1,jz1);
721             dx12             = _mm_sub_ps(ix1,jx2);
722             dy12             = _mm_sub_ps(iy1,jy2);
723             dz12             = _mm_sub_ps(iz1,jz2);
724             dx13             = _mm_sub_ps(ix1,jx3);
725             dy13             = _mm_sub_ps(iy1,jy3);
726             dz13             = _mm_sub_ps(iz1,jz3);
727             dx21             = _mm_sub_ps(ix2,jx1);
728             dy21             = _mm_sub_ps(iy2,jy1);
729             dz21             = _mm_sub_ps(iz2,jz1);
730             dx22             = _mm_sub_ps(ix2,jx2);
731             dy22             = _mm_sub_ps(iy2,jy2);
732             dz22             = _mm_sub_ps(iz2,jz2);
733             dx23             = _mm_sub_ps(ix2,jx3);
734             dy23             = _mm_sub_ps(iy2,jy3);
735             dz23             = _mm_sub_ps(iz2,jz3);
736             dx31             = _mm_sub_ps(ix3,jx1);
737             dy31             = _mm_sub_ps(iy3,jy1);
738             dz31             = _mm_sub_ps(iz3,jz1);
739             dx32             = _mm_sub_ps(ix3,jx2);
740             dy32             = _mm_sub_ps(iy3,jy2);
741             dz32             = _mm_sub_ps(iz3,jz2);
742             dx33             = _mm_sub_ps(ix3,jx3);
743             dy33             = _mm_sub_ps(iy3,jy3);
744             dz33             = _mm_sub_ps(iz3,jz3);
745
746             /* Calculate squared distance and things based on it */
747             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
748             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
749             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
750             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
751             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
752             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
753             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
754             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
755             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
756             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
757
758             rinv00           = gmx_mm_invsqrt_ps(rsq00);
759             rinv11           = gmx_mm_invsqrt_ps(rsq11);
760             rinv12           = gmx_mm_invsqrt_ps(rsq12);
761             rinv13           = gmx_mm_invsqrt_ps(rsq13);
762             rinv21           = gmx_mm_invsqrt_ps(rsq21);
763             rinv22           = gmx_mm_invsqrt_ps(rsq22);
764             rinv23           = gmx_mm_invsqrt_ps(rsq23);
765             rinv31           = gmx_mm_invsqrt_ps(rsq31);
766             rinv32           = gmx_mm_invsqrt_ps(rsq32);
767             rinv33           = gmx_mm_invsqrt_ps(rsq33);
768
769             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
770             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
771             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
772             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
773             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
774             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
775             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
776             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
777             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
778
779             fjx0             = _mm_setzero_ps();
780             fjy0             = _mm_setzero_ps();
781             fjz0             = _mm_setzero_ps();
782             fjx1             = _mm_setzero_ps();
783             fjy1             = _mm_setzero_ps();
784             fjz1             = _mm_setzero_ps();
785             fjx2             = _mm_setzero_ps();
786             fjy2             = _mm_setzero_ps();
787             fjz2             = _mm_setzero_ps();
788             fjx3             = _mm_setzero_ps();
789             fjy3             = _mm_setzero_ps();
790             fjz3             = _mm_setzero_ps();
791
792             /**************************
793              * CALCULATE INTERACTIONS *
794              **************************/
795
796             r00              = _mm_mul_ps(rsq00,rinv00);
797             r00              = _mm_andnot_ps(dummy_mask,r00);
798
799             /* Calculate table index by multiplying r with table scale and truncate to integer */
800             rt               = _mm_mul_ps(r00,vftabscale);
801             vfitab           = _mm_cvttps_epi32(rt);
802 #ifdef __XOP__
803             vfeps            = _mm_frcz_ps(rt);
804 #else
805             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
806 #endif
807             twovfeps         = _mm_add_ps(vfeps,vfeps);
808             vfitab           = _mm_slli_epi32(vfitab,3);
809
810             /* CUBIC SPLINE TABLE DISPERSION */
811             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
812             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
813             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
814             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
815             _MM_TRANSPOSE4_PS(Y,F,G,H);
816             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
817             VV               = _mm_macc_ps(vfeps,Fp,Y);
818             vvdw6            = _mm_mul_ps(c6_00,VV);
819             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
820             fvdw6            = _mm_mul_ps(c6_00,FF);
821
822             /* CUBIC SPLINE TABLE REPULSION */
823             vfitab           = _mm_add_epi32(vfitab,ifour);
824             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
825             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
826             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
827             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
828             _MM_TRANSPOSE4_PS(Y,F,G,H);
829             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
830             VV               = _mm_macc_ps(vfeps,Fp,Y);
831             vvdw12           = _mm_mul_ps(c12_00,VV);
832             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
833             fvdw12           = _mm_mul_ps(c12_00,FF);
834             vvdw             = _mm_add_ps(vvdw12,vvdw6);
835             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
836
837             /* Update potential sum for this i atom from the interaction with this j atom. */
838             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
839             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
840
841             fscal            = fvdw;
842
843             fscal            = _mm_andnot_ps(dummy_mask,fscal);
844
845              /* Update vectorial force */
846             fix0             = _mm_macc_ps(dx00,fscal,fix0);
847             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
848             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
849
850             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
851             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
852             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
853
854             /**************************
855              * CALCULATE INTERACTIONS *
856              **************************/
857
858             r11              = _mm_mul_ps(rsq11,rinv11);
859             r11              = _mm_andnot_ps(dummy_mask,r11);
860
861             /* EWALD ELECTROSTATICS */
862
863             /* Analytical PME correction */
864             zeta2            = _mm_mul_ps(beta2,rsq11);
865             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
866             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
867             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
868             felec            = _mm_mul_ps(qq11,felec);
869             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
870             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
871             velec            = _mm_mul_ps(qq11,velec);
872
873             /* Update potential sum for this i atom from the interaction with this j atom. */
874             velec            = _mm_andnot_ps(dummy_mask,velec);
875             velecsum         = _mm_add_ps(velecsum,velec);
876
877             fscal            = felec;
878
879             fscal            = _mm_andnot_ps(dummy_mask,fscal);
880
881              /* Update vectorial force */
882             fix1             = _mm_macc_ps(dx11,fscal,fix1);
883             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
884             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
885
886             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
887             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
888             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
889
890             /**************************
891              * CALCULATE INTERACTIONS *
892              **************************/
893
894             r12              = _mm_mul_ps(rsq12,rinv12);
895             r12              = _mm_andnot_ps(dummy_mask,r12);
896
897             /* EWALD ELECTROSTATICS */
898
899             /* Analytical PME correction */
900             zeta2            = _mm_mul_ps(beta2,rsq12);
901             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
902             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
903             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
904             felec            = _mm_mul_ps(qq12,felec);
905             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
906             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
907             velec            = _mm_mul_ps(qq12,velec);
908
909             /* Update potential sum for this i atom from the interaction with this j atom. */
910             velec            = _mm_andnot_ps(dummy_mask,velec);
911             velecsum         = _mm_add_ps(velecsum,velec);
912
913             fscal            = felec;
914
915             fscal            = _mm_andnot_ps(dummy_mask,fscal);
916
917              /* Update vectorial force */
918             fix1             = _mm_macc_ps(dx12,fscal,fix1);
919             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
920             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
921
922             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
923             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
924             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
925
926             /**************************
927              * CALCULATE INTERACTIONS *
928              **************************/
929
930             r13              = _mm_mul_ps(rsq13,rinv13);
931             r13              = _mm_andnot_ps(dummy_mask,r13);
932
933             /* EWALD ELECTROSTATICS */
934
935             /* Analytical PME correction */
936             zeta2            = _mm_mul_ps(beta2,rsq13);
937             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
938             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
939             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
940             felec            = _mm_mul_ps(qq13,felec);
941             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
942             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
943             velec            = _mm_mul_ps(qq13,velec);
944
945             /* Update potential sum for this i atom from the interaction with this j atom. */
946             velec            = _mm_andnot_ps(dummy_mask,velec);
947             velecsum         = _mm_add_ps(velecsum,velec);
948
949             fscal            = felec;
950
951             fscal            = _mm_andnot_ps(dummy_mask,fscal);
952
953              /* Update vectorial force */
954             fix1             = _mm_macc_ps(dx13,fscal,fix1);
955             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
956             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
957
958             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
959             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
960             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
961
962             /**************************
963              * CALCULATE INTERACTIONS *
964              **************************/
965
966             r21              = _mm_mul_ps(rsq21,rinv21);
967             r21              = _mm_andnot_ps(dummy_mask,r21);
968
969             /* EWALD ELECTROSTATICS */
970
971             /* Analytical PME correction */
972             zeta2            = _mm_mul_ps(beta2,rsq21);
973             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
974             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
975             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
976             felec            = _mm_mul_ps(qq21,felec);
977             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
978             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
979             velec            = _mm_mul_ps(qq21,velec);
980
981             /* Update potential sum for this i atom from the interaction with this j atom. */
982             velec            = _mm_andnot_ps(dummy_mask,velec);
983             velecsum         = _mm_add_ps(velecsum,velec);
984
985             fscal            = felec;
986
987             fscal            = _mm_andnot_ps(dummy_mask,fscal);
988
989              /* Update vectorial force */
990             fix2             = _mm_macc_ps(dx21,fscal,fix2);
991             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
992             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
993
994             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
995             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
996             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
997
998             /**************************
999              * CALCULATE INTERACTIONS *
1000              **************************/
1001
1002             r22              = _mm_mul_ps(rsq22,rinv22);
1003             r22              = _mm_andnot_ps(dummy_mask,r22);
1004
1005             /* EWALD ELECTROSTATICS */
1006
1007             /* Analytical PME correction */
1008             zeta2            = _mm_mul_ps(beta2,rsq22);
1009             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1010             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1011             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1012             felec            = _mm_mul_ps(qq22,felec);
1013             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1014             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1015             velec            = _mm_mul_ps(qq22,velec);
1016
1017             /* Update potential sum for this i atom from the interaction with this j atom. */
1018             velec            = _mm_andnot_ps(dummy_mask,velec);
1019             velecsum         = _mm_add_ps(velecsum,velec);
1020
1021             fscal            = felec;
1022
1023             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1024
1025              /* Update vectorial force */
1026             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1027             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1028             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1029
1030             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1031             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1032             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1033
1034             /**************************
1035              * CALCULATE INTERACTIONS *
1036              **************************/
1037
1038             r23              = _mm_mul_ps(rsq23,rinv23);
1039             r23              = _mm_andnot_ps(dummy_mask,r23);
1040
1041             /* EWALD ELECTROSTATICS */
1042
1043             /* Analytical PME correction */
1044             zeta2            = _mm_mul_ps(beta2,rsq23);
1045             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1046             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1047             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1048             felec            = _mm_mul_ps(qq23,felec);
1049             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1050             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1051             velec            = _mm_mul_ps(qq23,velec);
1052
1053             /* Update potential sum for this i atom from the interaction with this j atom. */
1054             velec            = _mm_andnot_ps(dummy_mask,velec);
1055             velecsum         = _mm_add_ps(velecsum,velec);
1056
1057             fscal            = felec;
1058
1059             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1060
1061              /* Update vectorial force */
1062             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1063             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1064             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1065
1066             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1067             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1068             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1069
1070             /**************************
1071              * CALCULATE INTERACTIONS *
1072              **************************/
1073
1074             r31              = _mm_mul_ps(rsq31,rinv31);
1075             r31              = _mm_andnot_ps(dummy_mask,r31);
1076
1077             /* EWALD ELECTROSTATICS */
1078
1079             /* Analytical PME correction */
1080             zeta2            = _mm_mul_ps(beta2,rsq31);
1081             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1082             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1083             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1084             felec            = _mm_mul_ps(qq31,felec);
1085             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1086             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1087             velec            = _mm_mul_ps(qq31,velec);
1088
1089             /* Update potential sum for this i atom from the interaction with this j atom. */
1090             velec            = _mm_andnot_ps(dummy_mask,velec);
1091             velecsum         = _mm_add_ps(velecsum,velec);
1092
1093             fscal            = felec;
1094
1095             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1096
1097              /* Update vectorial force */
1098             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1099             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1100             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1101
1102             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1103             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1104             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1105
1106             /**************************
1107              * CALCULATE INTERACTIONS *
1108              **************************/
1109
1110             r32              = _mm_mul_ps(rsq32,rinv32);
1111             r32              = _mm_andnot_ps(dummy_mask,r32);
1112
1113             /* EWALD ELECTROSTATICS */
1114
1115             /* Analytical PME correction */
1116             zeta2            = _mm_mul_ps(beta2,rsq32);
1117             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1118             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1119             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1120             felec            = _mm_mul_ps(qq32,felec);
1121             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1122             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1123             velec            = _mm_mul_ps(qq32,velec);
1124
1125             /* Update potential sum for this i atom from the interaction with this j atom. */
1126             velec            = _mm_andnot_ps(dummy_mask,velec);
1127             velecsum         = _mm_add_ps(velecsum,velec);
1128
1129             fscal            = felec;
1130
1131             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1132
1133              /* Update vectorial force */
1134             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1135             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1136             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1137
1138             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1139             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1140             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1141
1142             /**************************
1143              * CALCULATE INTERACTIONS *
1144              **************************/
1145
1146             r33              = _mm_mul_ps(rsq33,rinv33);
1147             r33              = _mm_andnot_ps(dummy_mask,r33);
1148
1149             /* EWALD ELECTROSTATICS */
1150
1151             /* Analytical PME correction */
1152             zeta2            = _mm_mul_ps(beta2,rsq33);
1153             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1154             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1155             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1156             felec            = _mm_mul_ps(qq33,felec);
1157             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1158             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1159             velec            = _mm_mul_ps(qq33,velec);
1160
1161             /* Update potential sum for this i atom from the interaction with this j atom. */
1162             velec            = _mm_andnot_ps(dummy_mask,velec);
1163             velecsum         = _mm_add_ps(velecsum,velec);
1164
1165             fscal            = felec;
1166
1167             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1168
1169              /* Update vectorial force */
1170             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1171             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1172             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1173
1174             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1175             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1176             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1177
1178             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1179             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1180             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1181             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1182
1183             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1184                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1185                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1186
1187             /* Inner loop uses 333 flops */
1188         }
1189
1190         /* End of innermost loop */
1191
1192         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1193                                               f+i_coord_offset,fshift+i_shift_offset);
1194
1195         ggid                        = gid[iidx];
1196         /* Update potential energies */
1197         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1198         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1199
1200         /* Increment number of inner iterations */
1201         inneriter                  += j_index_end - j_index_start;
1202
1203         /* Outer loop uses 26 flops */
1204     }
1205
1206     /* Increment number of outer iterations */
1207     outeriter        += nri;
1208
1209     /* Update outer/inner flops */
1210
1211     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*333);
1212 }
1213 /*
1214  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1215  * Electrostatics interaction: Ewald
1216  * VdW interaction:            CubicSplineTable
1217  * Geometry:                   Water4-Water4
1218  * Calculate force/pot:        Force
1219  */
1220 void
1221 nb_kernel_ElecEw_VdwCSTab_GeomW4W4_F_avx_128_fma_single
1222                     (t_nblist                    * gmx_restrict       nlist,
1223                      rvec                        * gmx_restrict          xx,
1224                      rvec                        * gmx_restrict          ff,
1225                      t_forcerec                  * gmx_restrict          fr,
1226                      t_mdatoms                   * gmx_restrict     mdatoms,
1227                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1228                      t_nrnb                      * gmx_restrict        nrnb)
1229 {
1230     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1231      * just 0 for non-waters.
1232      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1233      * jnr indices corresponding to data put in the four positions in the SIMD register.
1234      */
1235     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1236     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1237     int              jnrA,jnrB,jnrC,jnrD;
1238     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1239     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1240     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1241     real             rcutoff_scalar;
1242     real             *shiftvec,*fshift,*x,*f;
1243     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1244     real             scratch[4*DIM];
1245     __m128           fscal,rcutoff,rcutoff2,jidxall;
1246     int              vdwioffset0;
1247     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1248     int              vdwioffset1;
1249     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1250     int              vdwioffset2;
1251     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1252     int              vdwioffset3;
1253     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1254     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1255     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1256     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1257     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1258     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1259     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1260     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1261     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1262     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1263     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1264     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1265     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1266     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1267     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1268     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1269     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1270     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1271     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1272     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1273     real             *charge;
1274     int              nvdwtype;
1275     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1276     int              *vdwtype;
1277     real             *vdwparam;
1278     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1279     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1280     __m128i          vfitab;
1281     __m128i          ifour       = _mm_set1_epi32(4);
1282     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1283     real             *vftab;
1284     __m128i          ewitab;
1285     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1286     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1287     real             *ewtab;
1288     __m128           dummy_mask,cutoff_mask;
1289     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1290     __m128           one     = _mm_set1_ps(1.0);
1291     __m128           two     = _mm_set1_ps(2.0);
1292     x                = xx[0];
1293     f                = ff[0];
1294
1295     nri              = nlist->nri;
1296     iinr             = nlist->iinr;
1297     jindex           = nlist->jindex;
1298     jjnr             = nlist->jjnr;
1299     shiftidx         = nlist->shift;
1300     gid              = nlist->gid;
1301     shiftvec         = fr->shift_vec[0];
1302     fshift           = fr->fshift[0];
1303     facel            = _mm_set1_ps(fr->epsfac);
1304     charge           = mdatoms->chargeA;
1305     nvdwtype         = fr->ntype;
1306     vdwparam         = fr->nbfp;
1307     vdwtype          = mdatoms->typeA;
1308
1309     vftab            = kernel_data->table_vdw->data;
1310     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1311
1312     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1313     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1314     beta2            = _mm_mul_ps(beta,beta);
1315     beta3            = _mm_mul_ps(beta,beta2);
1316     ewtab            = fr->ic->tabq_coul_F;
1317     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1318     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1319
1320     /* Setup water-specific parameters */
1321     inr              = nlist->iinr[0];
1322     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1323     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1324     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1325     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1326
1327     jq1              = _mm_set1_ps(charge[inr+1]);
1328     jq2              = _mm_set1_ps(charge[inr+2]);
1329     jq3              = _mm_set1_ps(charge[inr+3]);
1330     vdwjidx0A        = 2*vdwtype[inr+0];
1331     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1332     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1333     qq11             = _mm_mul_ps(iq1,jq1);
1334     qq12             = _mm_mul_ps(iq1,jq2);
1335     qq13             = _mm_mul_ps(iq1,jq3);
1336     qq21             = _mm_mul_ps(iq2,jq1);
1337     qq22             = _mm_mul_ps(iq2,jq2);
1338     qq23             = _mm_mul_ps(iq2,jq3);
1339     qq31             = _mm_mul_ps(iq3,jq1);
1340     qq32             = _mm_mul_ps(iq3,jq2);
1341     qq33             = _mm_mul_ps(iq3,jq3);
1342
1343     /* Avoid stupid compiler warnings */
1344     jnrA = jnrB = jnrC = jnrD = 0;
1345     j_coord_offsetA = 0;
1346     j_coord_offsetB = 0;
1347     j_coord_offsetC = 0;
1348     j_coord_offsetD = 0;
1349
1350     outeriter        = 0;
1351     inneriter        = 0;
1352
1353     for(iidx=0;iidx<4*DIM;iidx++)
1354     {
1355         scratch[iidx] = 0.0;
1356     }
1357
1358     /* Start outer loop over neighborlists */
1359     for(iidx=0; iidx<nri; iidx++)
1360     {
1361         /* Load shift vector for this list */
1362         i_shift_offset   = DIM*shiftidx[iidx];
1363
1364         /* Load limits for loop over neighbors */
1365         j_index_start    = jindex[iidx];
1366         j_index_end      = jindex[iidx+1];
1367
1368         /* Get outer coordinate index */
1369         inr              = iinr[iidx];
1370         i_coord_offset   = DIM*inr;
1371
1372         /* Load i particle coords and add shift vector */
1373         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1374                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1375
1376         fix0             = _mm_setzero_ps();
1377         fiy0             = _mm_setzero_ps();
1378         fiz0             = _mm_setzero_ps();
1379         fix1             = _mm_setzero_ps();
1380         fiy1             = _mm_setzero_ps();
1381         fiz1             = _mm_setzero_ps();
1382         fix2             = _mm_setzero_ps();
1383         fiy2             = _mm_setzero_ps();
1384         fiz2             = _mm_setzero_ps();
1385         fix3             = _mm_setzero_ps();
1386         fiy3             = _mm_setzero_ps();
1387         fiz3             = _mm_setzero_ps();
1388
1389         /* Start inner kernel loop */
1390         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1391         {
1392
1393             /* Get j neighbor index, and coordinate index */
1394             jnrA             = jjnr[jidx];
1395             jnrB             = jjnr[jidx+1];
1396             jnrC             = jjnr[jidx+2];
1397             jnrD             = jjnr[jidx+3];
1398             j_coord_offsetA  = DIM*jnrA;
1399             j_coord_offsetB  = DIM*jnrB;
1400             j_coord_offsetC  = DIM*jnrC;
1401             j_coord_offsetD  = DIM*jnrD;
1402
1403             /* load j atom coordinates */
1404             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1405                                               x+j_coord_offsetC,x+j_coord_offsetD,
1406                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1407                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1408
1409             /* Calculate displacement vector */
1410             dx00             = _mm_sub_ps(ix0,jx0);
1411             dy00             = _mm_sub_ps(iy0,jy0);
1412             dz00             = _mm_sub_ps(iz0,jz0);
1413             dx11             = _mm_sub_ps(ix1,jx1);
1414             dy11             = _mm_sub_ps(iy1,jy1);
1415             dz11             = _mm_sub_ps(iz1,jz1);
1416             dx12             = _mm_sub_ps(ix1,jx2);
1417             dy12             = _mm_sub_ps(iy1,jy2);
1418             dz12             = _mm_sub_ps(iz1,jz2);
1419             dx13             = _mm_sub_ps(ix1,jx3);
1420             dy13             = _mm_sub_ps(iy1,jy3);
1421             dz13             = _mm_sub_ps(iz1,jz3);
1422             dx21             = _mm_sub_ps(ix2,jx1);
1423             dy21             = _mm_sub_ps(iy2,jy1);
1424             dz21             = _mm_sub_ps(iz2,jz1);
1425             dx22             = _mm_sub_ps(ix2,jx2);
1426             dy22             = _mm_sub_ps(iy2,jy2);
1427             dz22             = _mm_sub_ps(iz2,jz2);
1428             dx23             = _mm_sub_ps(ix2,jx3);
1429             dy23             = _mm_sub_ps(iy2,jy3);
1430             dz23             = _mm_sub_ps(iz2,jz3);
1431             dx31             = _mm_sub_ps(ix3,jx1);
1432             dy31             = _mm_sub_ps(iy3,jy1);
1433             dz31             = _mm_sub_ps(iz3,jz1);
1434             dx32             = _mm_sub_ps(ix3,jx2);
1435             dy32             = _mm_sub_ps(iy3,jy2);
1436             dz32             = _mm_sub_ps(iz3,jz2);
1437             dx33             = _mm_sub_ps(ix3,jx3);
1438             dy33             = _mm_sub_ps(iy3,jy3);
1439             dz33             = _mm_sub_ps(iz3,jz3);
1440
1441             /* Calculate squared distance and things based on it */
1442             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1443             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1444             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1445             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1446             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1447             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1448             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1449             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1450             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1451             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1452
1453             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1454             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1455             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1456             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1457             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1458             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1459             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1460             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1461             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1462             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1463
1464             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1465             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1466             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1467             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1468             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1469             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1470             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1471             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1472             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1473
1474             fjx0             = _mm_setzero_ps();
1475             fjy0             = _mm_setzero_ps();
1476             fjz0             = _mm_setzero_ps();
1477             fjx1             = _mm_setzero_ps();
1478             fjy1             = _mm_setzero_ps();
1479             fjz1             = _mm_setzero_ps();
1480             fjx2             = _mm_setzero_ps();
1481             fjy2             = _mm_setzero_ps();
1482             fjz2             = _mm_setzero_ps();
1483             fjx3             = _mm_setzero_ps();
1484             fjy3             = _mm_setzero_ps();
1485             fjz3             = _mm_setzero_ps();
1486
1487             /**************************
1488              * CALCULATE INTERACTIONS *
1489              **************************/
1490
1491             r00              = _mm_mul_ps(rsq00,rinv00);
1492
1493             /* Calculate table index by multiplying r with table scale and truncate to integer */
1494             rt               = _mm_mul_ps(r00,vftabscale);
1495             vfitab           = _mm_cvttps_epi32(rt);
1496 #ifdef __XOP__
1497             vfeps            = _mm_frcz_ps(rt);
1498 #else
1499             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1500 #endif
1501             twovfeps         = _mm_add_ps(vfeps,vfeps);
1502             vfitab           = _mm_slli_epi32(vfitab,3);
1503
1504             /* CUBIC SPLINE TABLE DISPERSION */
1505             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1506             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1507             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1508             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1509             _MM_TRANSPOSE4_PS(Y,F,G,H);
1510             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1511             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1512             fvdw6            = _mm_mul_ps(c6_00,FF);
1513
1514             /* CUBIC SPLINE TABLE REPULSION */
1515             vfitab           = _mm_add_epi32(vfitab,ifour);
1516             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1517             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1518             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1519             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1520             _MM_TRANSPOSE4_PS(Y,F,G,H);
1521             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1522             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1523             fvdw12           = _mm_mul_ps(c12_00,FF);
1524             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1525
1526             fscal            = fvdw;
1527
1528              /* Update vectorial force */
1529             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1530             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1531             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1532
1533             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1534             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1535             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1536
1537             /**************************
1538              * CALCULATE INTERACTIONS *
1539              **************************/
1540
1541             r11              = _mm_mul_ps(rsq11,rinv11);
1542
1543             /* EWALD ELECTROSTATICS */
1544
1545             /* Analytical PME correction */
1546             zeta2            = _mm_mul_ps(beta2,rsq11);
1547             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1548             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1549             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1550             felec            = _mm_mul_ps(qq11,felec);
1551
1552             fscal            = felec;
1553
1554              /* Update vectorial force */
1555             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1556             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1557             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1558
1559             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1560             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1561             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1562
1563             /**************************
1564              * CALCULATE INTERACTIONS *
1565              **************************/
1566
1567             r12              = _mm_mul_ps(rsq12,rinv12);
1568
1569             /* EWALD ELECTROSTATICS */
1570
1571             /* Analytical PME correction */
1572             zeta2            = _mm_mul_ps(beta2,rsq12);
1573             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1574             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1575             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1576             felec            = _mm_mul_ps(qq12,felec);
1577
1578             fscal            = felec;
1579
1580              /* Update vectorial force */
1581             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1582             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1583             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1584
1585             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1586             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1587             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1588
1589             /**************************
1590              * CALCULATE INTERACTIONS *
1591              **************************/
1592
1593             r13              = _mm_mul_ps(rsq13,rinv13);
1594
1595             /* EWALD ELECTROSTATICS */
1596
1597             /* Analytical PME correction */
1598             zeta2            = _mm_mul_ps(beta2,rsq13);
1599             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1600             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1601             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1602             felec            = _mm_mul_ps(qq13,felec);
1603
1604             fscal            = felec;
1605
1606              /* Update vectorial force */
1607             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1608             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1609             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1610
1611             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1612             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1613             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1614
1615             /**************************
1616              * CALCULATE INTERACTIONS *
1617              **************************/
1618
1619             r21              = _mm_mul_ps(rsq21,rinv21);
1620
1621             /* EWALD ELECTROSTATICS */
1622
1623             /* Analytical PME correction */
1624             zeta2            = _mm_mul_ps(beta2,rsq21);
1625             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1626             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1627             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1628             felec            = _mm_mul_ps(qq21,felec);
1629
1630             fscal            = felec;
1631
1632              /* Update vectorial force */
1633             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1634             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1635             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1636
1637             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1638             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1639             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1640
1641             /**************************
1642              * CALCULATE INTERACTIONS *
1643              **************************/
1644
1645             r22              = _mm_mul_ps(rsq22,rinv22);
1646
1647             /* EWALD ELECTROSTATICS */
1648
1649             /* Analytical PME correction */
1650             zeta2            = _mm_mul_ps(beta2,rsq22);
1651             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1652             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1653             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1654             felec            = _mm_mul_ps(qq22,felec);
1655
1656             fscal            = felec;
1657
1658              /* Update vectorial force */
1659             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1660             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1661             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1662
1663             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1664             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1665             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1666
1667             /**************************
1668              * CALCULATE INTERACTIONS *
1669              **************************/
1670
1671             r23              = _mm_mul_ps(rsq23,rinv23);
1672
1673             /* EWALD ELECTROSTATICS */
1674
1675             /* Analytical PME correction */
1676             zeta2            = _mm_mul_ps(beta2,rsq23);
1677             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1678             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1679             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1680             felec            = _mm_mul_ps(qq23,felec);
1681
1682             fscal            = felec;
1683
1684              /* Update vectorial force */
1685             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1686             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1687             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1688
1689             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1690             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1691             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1692
1693             /**************************
1694              * CALCULATE INTERACTIONS *
1695              **************************/
1696
1697             r31              = _mm_mul_ps(rsq31,rinv31);
1698
1699             /* EWALD ELECTROSTATICS */
1700
1701             /* Analytical PME correction */
1702             zeta2            = _mm_mul_ps(beta2,rsq31);
1703             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1704             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1705             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1706             felec            = _mm_mul_ps(qq31,felec);
1707
1708             fscal            = felec;
1709
1710              /* Update vectorial force */
1711             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1712             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1713             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1714
1715             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1716             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1717             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1718
1719             /**************************
1720              * CALCULATE INTERACTIONS *
1721              **************************/
1722
1723             r32              = _mm_mul_ps(rsq32,rinv32);
1724
1725             /* EWALD ELECTROSTATICS */
1726
1727             /* Analytical PME correction */
1728             zeta2            = _mm_mul_ps(beta2,rsq32);
1729             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1730             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1731             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1732             felec            = _mm_mul_ps(qq32,felec);
1733
1734             fscal            = felec;
1735
1736              /* Update vectorial force */
1737             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1738             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1739             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1740
1741             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1742             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1743             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1744
1745             /**************************
1746              * CALCULATE INTERACTIONS *
1747              **************************/
1748
1749             r33              = _mm_mul_ps(rsq33,rinv33);
1750
1751             /* EWALD ELECTROSTATICS */
1752
1753             /* Analytical PME correction */
1754             zeta2            = _mm_mul_ps(beta2,rsq33);
1755             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1756             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1757             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1758             felec            = _mm_mul_ps(qq33,felec);
1759
1760             fscal            = felec;
1761
1762              /* Update vectorial force */
1763             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1764             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1765             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1766
1767             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1768             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1769             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1770
1771             fjptrA             = f+j_coord_offsetA;
1772             fjptrB             = f+j_coord_offsetB;
1773             fjptrC             = f+j_coord_offsetC;
1774             fjptrD             = f+j_coord_offsetD;
1775
1776             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1777                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1778                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1779
1780             /* Inner loop uses 306 flops */
1781         }
1782
1783         if(jidx<j_index_end)
1784         {
1785
1786             /* Get j neighbor index, and coordinate index */
1787             jnrlistA         = jjnr[jidx];
1788             jnrlistB         = jjnr[jidx+1];
1789             jnrlistC         = jjnr[jidx+2];
1790             jnrlistD         = jjnr[jidx+3];
1791             /* Sign of each element will be negative for non-real atoms.
1792              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1793              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1794              */
1795             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1796             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1797             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1798             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1799             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1800             j_coord_offsetA  = DIM*jnrA;
1801             j_coord_offsetB  = DIM*jnrB;
1802             j_coord_offsetC  = DIM*jnrC;
1803             j_coord_offsetD  = DIM*jnrD;
1804
1805             /* load j atom coordinates */
1806             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1807                                               x+j_coord_offsetC,x+j_coord_offsetD,
1808                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1809                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1810
1811             /* Calculate displacement vector */
1812             dx00             = _mm_sub_ps(ix0,jx0);
1813             dy00             = _mm_sub_ps(iy0,jy0);
1814             dz00             = _mm_sub_ps(iz0,jz0);
1815             dx11             = _mm_sub_ps(ix1,jx1);
1816             dy11             = _mm_sub_ps(iy1,jy1);
1817             dz11             = _mm_sub_ps(iz1,jz1);
1818             dx12             = _mm_sub_ps(ix1,jx2);
1819             dy12             = _mm_sub_ps(iy1,jy2);
1820             dz12             = _mm_sub_ps(iz1,jz2);
1821             dx13             = _mm_sub_ps(ix1,jx3);
1822             dy13             = _mm_sub_ps(iy1,jy3);
1823             dz13             = _mm_sub_ps(iz1,jz3);
1824             dx21             = _mm_sub_ps(ix2,jx1);
1825             dy21             = _mm_sub_ps(iy2,jy1);
1826             dz21             = _mm_sub_ps(iz2,jz1);
1827             dx22             = _mm_sub_ps(ix2,jx2);
1828             dy22             = _mm_sub_ps(iy2,jy2);
1829             dz22             = _mm_sub_ps(iz2,jz2);
1830             dx23             = _mm_sub_ps(ix2,jx3);
1831             dy23             = _mm_sub_ps(iy2,jy3);
1832             dz23             = _mm_sub_ps(iz2,jz3);
1833             dx31             = _mm_sub_ps(ix3,jx1);
1834             dy31             = _mm_sub_ps(iy3,jy1);
1835             dz31             = _mm_sub_ps(iz3,jz1);
1836             dx32             = _mm_sub_ps(ix3,jx2);
1837             dy32             = _mm_sub_ps(iy3,jy2);
1838             dz32             = _mm_sub_ps(iz3,jz2);
1839             dx33             = _mm_sub_ps(ix3,jx3);
1840             dy33             = _mm_sub_ps(iy3,jy3);
1841             dz33             = _mm_sub_ps(iz3,jz3);
1842
1843             /* Calculate squared distance and things based on it */
1844             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1845             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1846             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1847             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1848             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1849             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1850             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1851             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1852             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1853             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1854
1855             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1856             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1857             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1858             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1859             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1860             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1861             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1862             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1863             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1864             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1865
1866             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1867             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1868             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1869             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1870             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1871             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1872             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1873             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1874             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1875
1876             fjx0             = _mm_setzero_ps();
1877             fjy0             = _mm_setzero_ps();
1878             fjz0             = _mm_setzero_ps();
1879             fjx1             = _mm_setzero_ps();
1880             fjy1             = _mm_setzero_ps();
1881             fjz1             = _mm_setzero_ps();
1882             fjx2             = _mm_setzero_ps();
1883             fjy2             = _mm_setzero_ps();
1884             fjz2             = _mm_setzero_ps();
1885             fjx3             = _mm_setzero_ps();
1886             fjy3             = _mm_setzero_ps();
1887             fjz3             = _mm_setzero_ps();
1888
1889             /**************************
1890              * CALCULATE INTERACTIONS *
1891              **************************/
1892
1893             r00              = _mm_mul_ps(rsq00,rinv00);
1894             r00              = _mm_andnot_ps(dummy_mask,r00);
1895
1896             /* Calculate table index by multiplying r with table scale and truncate to integer */
1897             rt               = _mm_mul_ps(r00,vftabscale);
1898             vfitab           = _mm_cvttps_epi32(rt);
1899 #ifdef __XOP__
1900             vfeps            = _mm_frcz_ps(rt);
1901 #else
1902             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1903 #endif
1904             twovfeps         = _mm_add_ps(vfeps,vfeps);
1905             vfitab           = _mm_slli_epi32(vfitab,3);
1906
1907             /* CUBIC SPLINE TABLE DISPERSION */
1908             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1909             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1910             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1911             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1912             _MM_TRANSPOSE4_PS(Y,F,G,H);
1913             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1914             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1915             fvdw6            = _mm_mul_ps(c6_00,FF);
1916
1917             /* CUBIC SPLINE TABLE REPULSION */
1918             vfitab           = _mm_add_epi32(vfitab,ifour);
1919             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1920             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1921             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1922             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1923             _MM_TRANSPOSE4_PS(Y,F,G,H);
1924             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1925             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1926             fvdw12           = _mm_mul_ps(c12_00,FF);
1927             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1928
1929             fscal            = fvdw;
1930
1931             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1932
1933              /* Update vectorial force */
1934             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1935             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1936             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1937
1938             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1939             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1940             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1941
1942             /**************************
1943              * CALCULATE INTERACTIONS *
1944              **************************/
1945
1946             r11              = _mm_mul_ps(rsq11,rinv11);
1947             r11              = _mm_andnot_ps(dummy_mask,r11);
1948
1949             /* EWALD ELECTROSTATICS */
1950
1951             /* Analytical PME correction */
1952             zeta2            = _mm_mul_ps(beta2,rsq11);
1953             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1954             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1955             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1956             felec            = _mm_mul_ps(qq11,felec);
1957
1958             fscal            = felec;
1959
1960             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1961
1962              /* Update vectorial force */
1963             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1964             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1965             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1966
1967             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1968             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1969             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1970
1971             /**************************
1972              * CALCULATE INTERACTIONS *
1973              **************************/
1974
1975             r12              = _mm_mul_ps(rsq12,rinv12);
1976             r12              = _mm_andnot_ps(dummy_mask,r12);
1977
1978             /* EWALD ELECTROSTATICS */
1979
1980             /* Analytical PME correction */
1981             zeta2            = _mm_mul_ps(beta2,rsq12);
1982             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1983             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1984             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1985             felec            = _mm_mul_ps(qq12,felec);
1986
1987             fscal            = felec;
1988
1989             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1990
1991              /* Update vectorial force */
1992             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1993             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1994             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1995
1996             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1997             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1998             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1999
2000             /**************************
2001              * CALCULATE INTERACTIONS *
2002              **************************/
2003
2004             r13              = _mm_mul_ps(rsq13,rinv13);
2005             r13              = _mm_andnot_ps(dummy_mask,r13);
2006
2007             /* EWALD ELECTROSTATICS */
2008
2009             /* Analytical PME correction */
2010             zeta2            = _mm_mul_ps(beta2,rsq13);
2011             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
2012             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2013             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2014             felec            = _mm_mul_ps(qq13,felec);
2015
2016             fscal            = felec;
2017
2018             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2019
2020              /* Update vectorial force */
2021             fix1             = _mm_macc_ps(dx13,fscal,fix1);
2022             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
2023             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
2024
2025             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
2026             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
2027             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
2028
2029             /**************************
2030              * CALCULATE INTERACTIONS *
2031              **************************/
2032
2033             r21              = _mm_mul_ps(rsq21,rinv21);
2034             r21              = _mm_andnot_ps(dummy_mask,r21);
2035
2036             /* EWALD ELECTROSTATICS */
2037
2038             /* Analytical PME correction */
2039             zeta2            = _mm_mul_ps(beta2,rsq21);
2040             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2041             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2042             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2043             felec            = _mm_mul_ps(qq21,felec);
2044
2045             fscal            = felec;
2046
2047             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2048
2049              /* Update vectorial force */
2050             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2051             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2052             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2053
2054             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2055             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2056             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2057
2058             /**************************
2059              * CALCULATE INTERACTIONS *
2060              **************************/
2061
2062             r22              = _mm_mul_ps(rsq22,rinv22);
2063             r22              = _mm_andnot_ps(dummy_mask,r22);
2064
2065             /* EWALD ELECTROSTATICS */
2066
2067             /* Analytical PME correction */
2068             zeta2            = _mm_mul_ps(beta2,rsq22);
2069             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2070             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2071             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2072             felec            = _mm_mul_ps(qq22,felec);
2073
2074             fscal            = felec;
2075
2076             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2077
2078              /* Update vectorial force */
2079             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2080             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2081             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2082
2083             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2084             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2085             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2086
2087             /**************************
2088              * CALCULATE INTERACTIONS *
2089              **************************/
2090
2091             r23              = _mm_mul_ps(rsq23,rinv23);
2092             r23              = _mm_andnot_ps(dummy_mask,r23);
2093
2094             /* EWALD ELECTROSTATICS */
2095
2096             /* Analytical PME correction */
2097             zeta2            = _mm_mul_ps(beta2,rsq23);
2098             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
2099             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2100             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2101             felec            = _mm_mul_ps(qq23,felec);
2102
2103             fscal            = felec;
2104
2105             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2106
2107              /* Update vectorial force */
2108             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2109             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2110             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2111
2112             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2113             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2114             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2115
2116             /**************************
2117              * CALCULATE INTERACTIONS *
2118              **************************/
2119
2120             r31              = _mm_mul_ps(rsq31,rinv31);
2121             r31              = _mm_andnot_ps(dummy_mask,r31);
2122
2123             /* EWALD ELECTROSTATICS */
2124
2125             /* Analytical PME correction */
2126             zeta2            = _mm_mul_ps(beta2,rsq31);
2127             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
2128             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2129             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2130             felec            = _mm_mul_ps(qq31,felec);
2131
2132             fscal            = felec;
2133
2134             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2135
2136              /* Update vectorial force */
2137             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2138             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2139             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2140
2141             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2142             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2143             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2144
2145             /**************************
2146              * CALCULATE INTERACTIONS *
2147              **************************/
2148
2149             r32              = _mm_mul_ps(rsq32,rinv32);
2150             r32              = _mm_andnot_ps(dummy_mask,r32);
2151
2152             /* EWALD ELECTROSTATICS */
2153
2154             /* Analytical PME correction */
2155             zeta2            = _mm_mul_ps(beta2,rsq32);
2156             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
2157             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2158             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2159             felec            = _mm_mul_ps(qq32,felec);
2160
2161             fscal            = felec;
2162
2163             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2164
2165              /* Update vectorial force */
2166             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2167             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2168             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2169
2170             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2171             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2172             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2173
2174             /**************************
2175              * CALCULATE INTERACTIONS *
2176              **************************/
2177
2178             r33              = _mm_mul_ps(rsq33,rinv33);
2179             r33              = _mm_andnot_ps(dummy_mask,r33);
2180
2181             /* EWALD ELECTROSTATICS */
2182
2183             /* Analytical PME correction */
2184             zeta2            = _mm_mul_ps(beta2,rsq33);
2185             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
2186             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2187             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2188             felec            = _mm_mul_ps(qq33,felec);
2189
2190             fscal            = felec;
2191
2192             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2193
2194              /* Update vectorial force */
2195             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2196             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2197             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2198
2199             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2200             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2201             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2202
2203             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2204             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2205             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2206             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2207
2208             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2209                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2210                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2211
2212             /* Inner loop uses 316 flops */
2213         }
2214
2215         /* End of innermost loop */
2216
2217         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2218                                               f+i_coord_offset,fshift+i_shift_offset);
2219
2220         /* Increment number of inner iterations */
2221         inneriter                  += j_index_end - j_index_start;
2222
2223         /* Outer loop uses 24 flops */
2224     }
2225
2226     /* Increment number of outer iterations */
2227     outeriter        += nri;
2228
2229     /* Update outer/inner flops */
2230
2231     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*316);
2232 }