Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwCSTab_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            CubicSplineTable
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
113     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
114     __m128i          vfitab;
115     __m128i          ifour       = _mm_set1_epi32(4);
116     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
117     real             *vftab;
118     __m128i          ewitab;
119     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
120     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
121     real             *ewtab;
122     __m128           dummy_mask,cutoff_mask;
123     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
124     __m128           one     = _mm_set1_ps(1.0);
125     __m128           two     = _mm_set1_ps(2.0);
126     x                = xx[0];
127     f                = ff[0];
128
129     nri              = nlist->nri;
130     iinr             = nlist->iinr;
131     jindex           = nlist->jindex;
132     jjnr             = nlist->jjnr;
133     shiftidx         = nlist->shift;
134     gid              = nlist->gid;
135     shiftvec         = fr->shift_vec[0];
136     fshift           = fr->fshift[0];
137     facel            = _mm_set1_ps(fr->epsfac);
138     charge           = mdatoms->chargeA;
139     nvdwtype         = fr->ntype;
140     vdwparam         = fr->nbfp;
141     vdwtype          = mdatoms->typeA;
142
143     vftab            = kernel_data->table_vdw->data;
144     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
145
146     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
147     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
148     beta2            = _mm_mul_ps(beta,beta);
149     beta3            = _mm_mul_ps(beta,beta2);
150     ewtab            = fr->ic->tabq_coul_FDV0;
151     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
152     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
153
154     /* Setup water-specific parameters */
155     inr              = nlist->iinr[0];
156     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
157     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
158     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
159     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
160
161     jq0              = _mm_set1_ps(charge[inr+0]);
162     jq1              = _mm_set1_ps(charge[inr+1]);
163     jq2              = _mm_set1_ps(charge[inr+2]);
164     vdwjidx0A        = 2*vdwtype[inr+0];
165     qq00             = _mm_mul_ps(iq0,jq0);
166     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
167     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
168     qq01             = _mm_mul_ps(iq0,jq1);
169     qq02             = _mm_mul_ps(iq0,jq2);
170     qq10             = _mm_mul_ps(iq1,jq0);
171     qq11             = _mm_mul_ps(iq1,jq1);
172     qq12             = _mm_mul_ps(iq1,jq2);
173     qq20             = _mm_mul_ps(iq2,jq0);
174     qq21             = _mm_mul_ps(iq2,jq1);
175     qq22             = _mm_mul_ps(iq2,jq2);
176
177     /* Avoid stupid compiler warnings */
178     jnrA = jnrB = jnrC = jnrD = 0;
179     j_coord_offsetA = 0;
180     j_coord_offsetB = 0;
181     j_coord_offsetC = 0;
182     j_coord_offsetD = 0;
183
184     outeriter        = 0;
185     inneriter        = 0;
186
187     for(iidx=0;iidx<4*DIM;iidx++)
188     {
189         scratch[iidx] = 0.0;
190     }
191
192     /* Start outer loop over neighborlists */
193     for(iidx=0; iidx<nri; iidx++)
194     {
195         /* Load shift vector for this list */
196         i_shift_offset   = DIM*shiftidx[iidx];
197
198         /* Load limits for loop over neighbors */
199         j_index_start    = jindex[iidx];
200         j_index_end      = jindex[iidx+1];
201
202         /* Get outer coordinate index */
203         inr              = iinr[iidx];
204         i_coord_offset   = DIM*inr;
205
206         /* Load i particle coords and add shift vector */
207         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
208                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
209
210         fix0             = _mm_setzero_ps();
211         fiy0             = _mm_setzero_ps();
212         fiz0             = _mm_setzero_ps();
213         fix1             = _mm_setzero_ps();
214         fiy1             = _mm_setzero_ps();
215         fiz1             = _mm_setzero_ps();
216         fix2             = _mm_setzero_ps();
217         fiy2             = _mm_setzero_ps();
218         fiz2             = _mm_setzero_ps();
219
220         /* Reset potential sums */
221         velecsum         = _mm_setzero_ps();
222         vvdwsum          = _mm_setzero_ps();
223
224         /* Start inner kernel loop */
225         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
226         {
227
228             /* Get j neighbor index, and coordinate index */
229             jnrA             = jjnr[jidx];
230             jnrB             = jjnr[jidx+1];
231             jnrC             = jjnr[jidx+2];
232             jnrD             = jjnr[jidx+3];
233             j_coord_offsetA  = DIM*jnrA;
234             j_coord_offsetB  = DIM*jnrB;
235             j_coord_offsetC  = DIM*jnrC;
236             j_coord_offsetD  = DIM*jnrD;
237
238             /* load j atom coordinates */
239             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
240                                               x+j_coord_offsetC,x+j_coord_offsetD,
241                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
242
243             /* Calculate displacement vector */
244             dx00             = _mm_sub_ps(ix0,jx0);
245             dy00             = _mm_sub_ps(iy0,jy0);
246             dz00             = _mm_sub_ps(iz0,jz0);
247             dx01             = _mm_sub_ps(ix0,jx1);
248             dy01             = _mm_sub_ps(iy0,jy1);
249             dz01             = _mm_sub_ps(iz0,jz1);
250             dx02             = _mm_sub_ps(ix0,jx2);
251             dy02             = _mm_sub_ps(iy0,jy2);
252             dz02             = _mm_sub_ps(iz0,jz2);
253             dx10             = _mm_sub_ps(ix1,jx0);
254             dy10             = _mm_sub_ps(iy1,jy0);
255             dz10             = _mm_sub_ps(iz1,jz0);
256             dx11             = _mm_sub_ps(ix1,jx1);
257             dy11             = _mm_sub_ps(iy1,jy1);
258             dz11             = _mm_sub_ps(iz1,jz1);
259             dx12             = _mm_sub_ps(ix1,jx2);
260             dy12             = _mm_sub_ps(iy1,jy2);
261             dz12             = _mm_sub_ps(iz1,jz2);
262             dx20             = _mm_sub_ps(ix2,jx0);
263             dy20             = _mm_sub_ps(iy2,jy0);
264             dz20             = _mm_sub_ps(iz2,jz0);
265             dx21             = _mm_sub_ps(ix2,jx1);
266             dy21             = _mm_sub_ps(iy2,jy1);
267             dz21             = _mm_sub_ps(iz2,jz1);
268             dx22             = _mm_sub_ps(ix2,jx2);
269             dy22             = _mm_sub_ps(iy2,jy2);
270             dz22             = _mm_sub_ps(iz2,jz2);
271
272             /* Calculate squared distance and things based on it */
273             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
274             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
275             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
276             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
277             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
278             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
279             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
280             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
281             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
282
283             rinv00           = gmx_mm_invsqrt_ps(rsq00);
284             rinv01           = gmx_mm_invsqrt_ps(rsq01);
285             rinv02           = gmx_mm_invsqrt_ps(rsq02);
286             rinv10           = gmx_mm_invsqrt_ps(rsq10);
287             rinv11           = gmx_mm_invsqrt_ps(rsq11);
288             rinv12           = gmx_mm_invsqrt_ps(rsq12);
289             rinv20           = gmx_mm_invsqrt_ps(rsq20);
290             rinv21           = gmx_mm_invsqrt_ps(rsq21);
291             rinv22           = gmx_mm_invsqrt_ps(rsq22);
292
293             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
294             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
295             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
296             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
297             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
298             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
299             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
300             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
301             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
302
303             fjx0             = _mm_setzero_ps();
304             fjy0             = _mm_setzero_ps();
305             fjz0             = _mm_setzero_ps();
306             fjx1             = _mm_setzero_ps();
307             fjy1             = _mm_setzero_ps();
308             fjz1             = _mm_setzero_ps();
309             fjx2             = _mm_setzero_ps();
310             fjy2             = _mm_setzero_ps();
311             fjz2             = _mm_setzero_ps();
312
313             /**************************
314              * CALCULATE INTERACTIONS *
315              **************************/
316
317             r00              = _mm_mul_ps(rsq00,rinv00);
318
319             /* Calculate table index by multiplying r with table scale and truncate to integer */
320             rt               = _mm_mul_ps(r00,vftabscale);
321             vfitab           = _mm_cvttps_epi32(rt);
322 #ifdef __XOP__
323             vfeps            = _mm_frcz_ps(rt);
324 #else
325             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
326 #endif
327             twovfeps         = _mm_add_ps(vfeps,vfeps);
328             vfitab           = _mm_slli_epi32(vfitab,3);
329
330             /* EWALD ELECTROSTATICS */
331
332             /* Analytical PME correction */
333             zeta2            = _mm_mul_ps(beta2,rsq00);
334             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
335             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
336             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
337             felec            = _mm_mul_ps(qq00,felec);
338             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
339             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
340             velec            = _mm_mul_ps(qq00,velec);
341
342             /* CUBIC SPLINE TABLE DISPERSION */
343             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
344             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
345             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
346             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
347             _MM_TRANSPOSE4_PS(Y,F,G,H);
348             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
349             VV               = _mm_macc_ps(vfeps,Fp,Y);
350             vvdw6            = _mm_mul_ps(c6_00,VV);
351             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
352             fvdw6            = _mm_mul_ps(c6_00,FF);
353
354             /* CUBIC SPLINE TABLE REPULSION */
355             vfitab           = _mm_add_epi32(vfitab,ifour);
356             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
357             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
358             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
359             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
360             _MM_TRANSPOSE4_PS(Y,F,G,H);
361             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
362             VV               = _mm_macc_ps(vfeps,Fp,Y);
363             vvdw12           = _mm_mul_ps(c12_00,VV);
364             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
365             fvdw12           = _mm_mul_ps(c12_00,FF);
366             vvdw             = _mm_add_ps(vvdw12,vvdw6);
367             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
368
369             /* Update potential sum for this i atom from the interaction with this j atom. */
370             velecsum         = _mm_add_ps(velecsum,velec);
371             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
372
373             fscal            = _mm_add_ps(felec,fvdw);
374
375              /* Update vectorial force */
376             fix0             = _mm_macc_ps(dx00,fscal,fix0);
377             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
378             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
379
380             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
381             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
382             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
383
384             /**************************
385              * CALCULATE INTERACTIONS *
386              **************************/
387
388             r01              = _mm_mul_ps(rsq01,rinv01);
389
390             /* EWALD ELECTROSTATICS */
391
392             /* Analytical PME correction */
393             zeta2            = _mm_mul_ps(beta2,rsq01);
394             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
395             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
396             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
397             felec            = _mm_mul_ps(qq01,felec);
398             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
399             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
400             velec            = _mm_mul_ps(qq01,velec);
401
402             /* Update potential sum for this i atom from the interaction with this j atom. */
403             velecsum         = _mm_add_ps(velecsum,velec);
404
405             fscal            = felec;
406
407              /* Update vectorial force */
408             fix0             = _mm_macc_ps(dx01,fscal,fix0);
409             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
410             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
411
412             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
413             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
414             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
415
416             /**************************
417              * CALCULATE INTERACTIONS *
418              **************************/
419
420             r02              = _mm_mul_ps(rsq02,rinv02);
421
422             /* EWALD ELECTROSTATICS */
423
424             /* Analytical PME correction */
425             zeta2            = _mm_mul_ps(beta2,rsq02);
426             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
427             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
428             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
429             felec            = _mm_mul_ps(qq02,felec);
430             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
431             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
432             velec            = _mm_mul_ps(qq02,velec);
433
434             /* Update potential sum for this i atom from the interaction with this j atom. */
435             velecsum         = _mm_add_ps(velecsum,velec);
436
437             fscal            = felec;
438
439              /* Update vectorial force */
440             fix0             = _mm_macc_ps(dx02,fscal,fix0);
441             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
442             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
443
444             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
445             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
446             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
447
448             /**************************
449              * CALCULATE INTERACTIONS *
450              **************************/
451
452             r10              = _mm_mul_ps(rsq10,rinv10);
453
454             /* EWALD ELECTROSTATICS */
455
456             /* Analytical PME correction */
457             zeta2            = _mm_mul_ps(beta2,rsq10);
458             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
459             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
460             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
461             felec            = _mm_mul_ps(qq10,felec);
462             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
463             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
464             velec            = _mm_mul_ps(qq10,velec);
465
466             /* Update potential sum for this i atom from the interaction with this j atom. */
467             velecsum         = _mm_add_ps(velecsum,velec);
468
469             fscal            = felec;
470
471              /* Update vectorial force */
472             fix1             = _mm_macc_ps(dx10,fscal,fix1);
473             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
474             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
475
476             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
477             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
478             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
479
480             /**************************
481              * CALCULATE INTERACTIONS *
482              **************************/
483
484             r11              = _mm_mul_ps(rsq11,rinv11);
485
486             /* EWALD ELECTROSTATICS */
487
488             /* Analytical PME correction */
489             zeta2            = _mm_mul_ps(beta2,rsq11);
490             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
491             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
492             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
493             felec            = _mm_mul_ps(qq11,felec);
494             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
495             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
496             velec            = _mm_mul_ps(qq11,velec);
497
498             /* Update potential sum for this i atom from the interaction with this j atom. */
499             velecsum         = _mm_add_ps(velecsum,velec);
500
501             fscal            = felec;
502
503              /* Update vectorial force */
504             fix1             = _mm_macc_ps(dx11,fscal,fix1);
505             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
506             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
507
508             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
509             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
510             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
511
512             /**************************
513              * CALCULATE INTERACTIONS *
514              **************************/
515
516             r12              = _mm_mul_ps(rsq12,rinv12);
517
518             /* EWALD ELECTROSTATICS */
519
520             /* Analytical PME correction */
521             zeta2            = _mm_mul_ps(beta2,rsq12);
522             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
523             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
524             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
525             felec            = _mm_mul_ps(qq12,felec);
526             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
527             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
528             velec            = _mm_mul_ps(qq12,velec);
529
530             /* Update potential sum for this i atom from the interaction with this j atom. */
531             velecsum         = _mm_add_ps(velecsum,velec);
532
533             fscal            = felec;
534
535              /* Update vectorial force */
536             fix1             = _mm_macc_ps(dx12,fscal,fix1);
537             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
538             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
539
540             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
541             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
542             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
543
544             /**************************
545              * CALCULATE INTERACTIONS *
546              **************************/
547
548             r20              = _mm_mul_ps(rsq20,rinv20);
549
550             /* EWALD ELECTROSTATICS */
551
552             /* Analytical PME correction */
553             zeta2            = _mm_mul_ps(beta2,rsq20);
554             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
555             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
556             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
557             felec            = _mm_mul_ps(qq20,felec);
558             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
559             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
560             velec            = _mm_mul_ps(qq20,velec);
561
562             /* Update potential sum for this i atom from the interaction with this j atom. */
563             velecsum         = _mm_add_ps(velecsum,velec);
564
565             fscal            = felec;
566
567              /* Update vectorial force */
568             fix2             = _mm_macc_ps(dx20,fscal,fix2);
569             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
570             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
571
572             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
573             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
574             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
575
576             /**************************
577              * CALCULATE INTERACTIONS *
578              **************************/
579
580             r21              = _mm_mul_ps(rsq21,rinv21);
581
582             /* EWALD ELECTROSTATICS */
583
584             /* Analytical PME correction */
585             zeta2            = _mm_mul_ps(beta2,rsq21);
586             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
587             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
588             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
589             felec            = _mm_mul_ps(qq21,felec);
590             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
591             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
592             velec            = _mm_mul_ps(qq21,velec);
593
594             /* Update potential sum for this i atom from the interaction with this j atom. */
595             velecsum         = _mm_add_ps(velecsum,velec);
596
597             fscal            = felec;
598
599              /* Update vectorial force */
600             fix2             = _mm_macc_ps(dx21,fscal,fix2);
601             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
602             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
603
604             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
605             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
606             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
607
608             /**************************
609              * CALCULATE INTERACTIONS *
610              **************************/
611
612             r22              = _mm_mul_ps(rsq22,rinv22);
613
614             /* EWALD ELECTROSTATICS */
615
616             /* Analytical PME correction */
617             zeta2            = _mm_mul_ps(beta2,rsq22);
618             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
619             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
620             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
621             felec            = _mm_mul_ps(qq22,felec);
622             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
623             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
624             velec            = _mm_mul_ps(qq22,velec);
625
626             /* Update potential sum for this i atom from the interaction with this j atom. */
627             velecsum         = _mm_add_ps(velecsum,velec);
628
629             fscal            = felec;
630
631              /* Update vectorial force */
632             fix2             = _mm_macc_ps(dx22,fscal,fix2);
633             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
634             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
635
636             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
637             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
638             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
639
640             fjptrA             = f+j_coord_offsetA;
641             fjptrB             = f+j_coord_offsetB;
642             fjptrC             = f+j_coord_offsetC;
643             fjptrD             = f+j_coord_offsetD;
644
645             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
646                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
647
648             /* Inner loop uses 295 flops */
649         }
650
651         if(jidx<j_index_end)
652         {
653
654             /* Get j neighbor index, and coordinate index */
655             jnrlistA         = jjnr[jidx];
656             jnrlistB         = jjnr[jidx+1];
657             jnrlistC         = jjnr[jidx+2];
658             jnrlistD         = jjnr[jidx+3];
659             /* Sign of each element will be negative for non-real atoms.
660              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
661              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
662              */
663             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
664             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
665             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
666             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
667             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
668             j_coord_offsetA  = DIM*jnrA;
669             j_coord_offsetB  = DIM*jnrB;
670             j_coord_offsetC  = DIM*jnrC;
671             j_coord_offsetD  = DIM*jnrD;
672
673             /* load j atom coordinates */
674             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
675                                               x+j_coord_offsetC,x+j_coord_offsetD,
676                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
677
678             /* Calculate displacement vector */
679             dx00             = _mm_sub_ps(ix0,jx0);
680             dy00             = _mm_sub_ps(iy0,jy0);
681             dz00             = _mm_sub_ps(iz0,jz0);
682             dx01             = _mm_sub_ps(ix0,jx1);
683             dy01             = _mm_sub_ps(iy0,jy1);
684             dz01             = _mm_sub_ps(iz0,jz1);
685             dx02             = _mm_sub_ps(ix0,jx2);
686             dy02             = _mm_sub_ps(iy0,jy2);
687             dz02             = _mm_sub_ps(iz0,jz2);
688             dx10             = _mm_sub_ps(ix1,jx0);
689             dy10             = _mm_sub_ps(iy1,jy0);
690             dz10             = _mm_sub_ps(iz1,jz0);
691             dx11             = _mm_sub_ps(ix1,jx1);
692             dy11             = _mm_sub_ps(iy1,jy1);
693             dz11             = _mm_sub_ps(iz1,jz1);
694             dx12             = _mm_sub_ps(ix1,jx2);
695             dy12             = _mm_sub_ps(iy1,jy2);
696             dz12             = _mm_sub_ps(iz1,jz2);
697             dx20             = _mm_sub_ps(ix2,jx0);
698             dy20             = _mm_sub_ps(iy2,jy0);
699             dz20             = _mm_sub_ps(iz2,jz0);
700             dx21             = _mm_sub_ps(ix2,jx1);
701             dy21             = _mm_sub_ps(iy2,jy1);
702             dz21             = _mm_sub_ps(iz2,jz1);
703             dx22             = _mm_sub_ps(ix2,jx2);
704             dy22             = _mm_sub_ps(iy2,jy2);
705             dz22             = _mm_sub_ps(iz2,jz2);
706
707             /* Calculate squared distance and things based on it */
708             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
709             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
710             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
711             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
712             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
713             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
714             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
715             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
716             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
717
718             rinv00           = gmx_mm_invsqrt_ps(rsq00);
719             rinv01           = gmx_mm_invsqrt_ps(rsq01);
720             rinv02           = gmx_mm_invsqrt_ps(rsq02);
721             rinv10           = gmx_mm_invsqrt_ps(rsq10);
722             rinv11           = gmx_mm_invsqrt_ps(rsq11);
723             rinv12           = gmx_mm_invsqrt_ps(rsq12);
724             rinv20           = gmx_mm_invsqrt_ps(rsq20);
725             rinv21           = gmx_mm_invsqrt_ps(rsq21);
726             rinv22           = gmx_mm_invsqrt_ps(rsq22);
727
728             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
729             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
730             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
731             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
732             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
733             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
734             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
735             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
736             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
737
738             fjx0             = _mm_setzero_ps();
739             fjy0             = _mm_setzero_ps();
740             fjz0             = _mm_setzero_ps();
741             fjx1             = _mm_setzero_ps();
742             fjy1             = _mm_setzero_ps();
743             fjz1             = _mm_setzero_ps();
744             fjx2             = _mm_setzero_ps();
745             fjy2             = _mm_setzero_ps();
746             fjz2             = _mm_setzero_ps();
747
748             /**************************
749              * CALCULATE INTERACTIONS *
750              **************************/
751
752             r00              = _mm_mul_ps(rsq00,rinv00);
753             r00              = _mm_andnot_ps(dummy_mask,r00);
754
755             /* Calculate table index by multiplying r with table scale and truncate to integer */
756             rt               = _mm_mul_ps(r00,vftabscale);
757             vfitab           = _mm_cvttps_epi32(rt);
758 #ifdef __XOP__
759             vfeps            = _mm_frcz_ps(rt);
760 #else
761             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
762 #endif
763             twovfeps         = _mm_add_ps(vfeps,vfeps);
764             vfitab           = _mm_slli_epi32(vfitab,3);
765
766             /* EWALD ELECTROSTATICS */
767
768             /* Analytical PME correction */
769             zeta2            = _mm_mul_ps(beta2,rsq00);
770             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
771             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
772             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
773             felec            = _mm_mul_ps(qq00,felec);
774             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
775             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
776             velec            = _mm_mul_ps(qq00,velec);
777
778             /* CUBIC SPLINE TABLE DISPERSION */
779             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
780             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
781             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
782             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
783             _MM_TRANSPOSE4_PS(Y,F,G,H);
784             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
785             VV               = _mm_macc_ps(vfeps,Fp,Y);
786             vvdw6            = _mm_mul_ps(c6_00,VV);
787             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
788             fvdw6            = _mm_mul_ps(c6_00,FF);
789
790             /* CUBIC SPLINE TABLE REPULSION */
791             vfitab           = _mm_add_epi32(vfitab,ifour);
792             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
793             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
794             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
795             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
796             _MM_TRANSPOSE4_PS(Y,F,G,H);
797             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
798             VV               = _mm_macc_ps(vfeps,Fp,Y);
799             vvdw12           = _mm_mul_ps(c12_00,VV);
800             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
801             fvdw12           = _mm_mul_ps(c12_00,FF);
802             vvdw             = _mm_add_ps(vvdw12,vvdw6);
803             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
804
805             /* Update potential sum for this i atom from the interaction with this j atom. */
806             velec            = _mm_andnot_ps(dummy_mask,velec);
807             velecsum         = _mm_add_ps(velecsum,velec);
808             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
809             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
810
811             fscal            = _mm_add_ps(felec,fvdw);
812
813             fscal            = _mm_andnot_ps(dummy_mask,fscal);
814
815              /* Update vectorial force */
816             fix0             = _mm_macc_ps(dx00,fscal,fix0);
817             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
818             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
819
820             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
821             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
822             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
823
824             /**************************
825              * CALCULATE INTERACTIONS *
826              **************************/
827
828             r01              = _mm_mul_ps(rsq01,rinv01);
829             r01              = _mm_andnot_ps(dummy_mask,r01);
830
831             /* EWALD ELECTROSTATICS */
832
833             /* Analytical PME correction */
834             zeta2            = _mm_mul_ps(beta2,rsq01);
835             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
836             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
837             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
838             felec            = _mm_mul_ps(qq01,felec);
839             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
840             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
841             velec            = _mm_mul_ps(qq01,velec);
842
843             /* Update potential sum for this i atom from the interaction with this j atom. */
844             velec            = _mm_andnot_ps(dummy_mask,velec);
845             velecsum         = _mm_add_ps(velecsum,velec);
846
847             fscal            = felec;
848
849             fscal            = _mm_andnot_ps(dummy_mask,fscal);
850
851              /* Update vectorial force */
852             fix0             = _mm_macc_ps(dx01,fscal,fix0);
853             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
854             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
855
856             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
857             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
858             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
859
860             /**************************
861              * CALCULATE INTERACTIONS *
862              **************************/
863
864             r02              = _mm_mul_ps(rsq02,rinv02);
865             r02              = _mm_andnot_ps(dummy_mask,r02);
866
867             /* EWALD ELECTROSTATICS */
868
869             /* Analytical PME correction */
870             zeta2            = _mm_mul_ps(beta2,rsq02);
871             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
872             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
873             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
874             felec            = _mm_mul_ps(qq02,felec);
875             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
876             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
877             velec            = _mm_mul_ps(qq02,velec);
878
879             /* Update potential sum for this i atom from the interaction with this j atom. */
880             velec            = _mm_andnot_ps(dummy_mask,velec);
881             velecsum         = _mm_add_ps(velecsum,velec);
882
883             fscal            = felec;
884
885             fscal            = _mm_andnot_ps(dummy_mask,fscal);
886
887              /* Update vectorial force */
888             fix0             = _mm_macc_ps(dx02,fscal,fix0);
889             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
890             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
891
892             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
893             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
894             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
895
896             /**************************
897              * CALCULATE INTERACTIONS *
898              **************************/
899
900             r10              = _mm_mul_ps(rsq10,rinv10);
901             r10              = _mm_andnot_ps(dummy_mask,r10);
902
903             /* EWALD ELECTROSTATICS */
904
905             /* Analytical PME correction */
906             zeta2            = _mm_mul_ps(beta2,rsq10);
907             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
908             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
909             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
910             felec            = _mm_mul_ps(qq10,felec);
911             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
912             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
913             velec            = _mm_mul_ps(qq10,velec);
914
915             /* Update potential sum for this i atom from the interaction with this j atom. */
916             velec            = _mm_andnot_ps(dummy_mask,velec);
917             velecsum         = _mm_add_ps(velecsum,velec);
918
919             fscal            = felec;
920
921             fscal            = _mm_andnot_ps(dummy_mask,fscal);
922
923              /* Update vectorial force */
924             fix1             = _mm_macc_ps(dx10,fscal,fix1);
925             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
926             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
927
928             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
929             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
930             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
931
932             /**************************
933              * CALCULATE INTERACTIONS *
934              **************************/
935
936             r11              = _mm_mul_ps(rsq11,rinv11);
937             r11              = _mm_andnot_ps(dummy_mask,r11);
938
939             /* EWALD ELECTROSTATICS */
940
941             /* Analytical PME correction */
942             zeta2            = _mm_mul_ps(beta2,rsq11);
943             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
944             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
945             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
946             felec            = _mm_mul_ps(qq11,felec);
947             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
948             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
949             velec            = _mm_mul_ps(qq11,velec);
950
951             /* Update potential sum for this i atom from the interaction with this j atom. */
952             velec            = _mm_andnot_ps(dummy_mask,velec);
953             velecsum         = _mm_add_ps(velecsum,velec);
954
955             fscal            = felec;
956
957             fscal            = _mm_andnot_ps(dummy_mask,fscal);
958
959              /* Update vectorial force */
960             fix1             = _mm_macc_ps(dx11,fscal,fix1);
961             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
962             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
963
964             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
965             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
966             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
967
968             /**************************
969              * CALCULATE INTERACTIONS *
970              **************************/
971
972             r12              = _mm_mul_ps(rsq12,rinv12);
973             r12              = _mm_andnot_ps(dummy_mask,r12);
974
975             /* EWALD ELECTROSTATICS */
976
977             /* Analytical PME correction */
978             zeta2            = _mm_mul_ps(beta2,rsq12);
979             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
980             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
981             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
982             felec            = _mm_mul_ps(qq12,felec);
983             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
984             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
985             velec            = _mm_mul_ps(qq12,velec);
986
987             /* Update potential sum for this i atom from the interaction with this j atom. */
988             velec            = _mm_andnot_ps(dummy_mask,velec);
989             velecsum         = _mm_add_ps(velecsum,velec);
990
991             fscal            = felec;
992
993             fscal            = _mm_andnot_ps(dummy_mask,fscal);
994
995              /* Update vectorial force */
996             fix1             = _mm_macc_ps(dx12,fscal,fix1);
997             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
998             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
999
1000             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1001             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1002             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             r20              = _mm_mul_ps(rsq20,rinv20);
1009             r20              = _mm_andnot_ps(dummy_mask,r20);
1010
1011             /* EWALD ELECTROSTATICS */
1012
1013             /* Analytical PME correction */
1014             zeta2            = _mm_mul_ps(beta2,rsq20);
1015             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1016             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1017             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1018             felec            = _mm_mul_ps(qq20,felec);
1019             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1020             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1021             velec            = _mm_mul_ps(qq20,velec);
1022
1023             /* Update potential sum for this i atom from the interaction with this j atom. */
1024             velec            = _mm_andnot_ps(dummy_mask,velec);
1025             velecsum         = _mm_add_ps(velecsum,velec);
1026
1027             fscal            = felec;
1028
1029             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1030
1031              /* Update vectorial force */
1032             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1033             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1034             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1035
1036             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1037             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1038             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1039
1040             /**************************
1041              * CALCULATE INTERACTIONS *
1042              **************************/
1043
1044             r21              = _mm_mul_ps(rsq21,rinv21);
1045             r21              = _mm_andnot_ps(dummy_mask,r21);
1046
1047             /* EWALD ELECTROSTATICS */
1048
1049             /* Analytical PME correction */
1050             zeta2            = _mm_mul_ps(beta2,rsq21);
1051             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1052             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1053             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1054             felec            = _mm_mul_ps(qq21,felec);
1055             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1056             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1057             velec            = _mm_mul_ps(qq21,velec);
1058
1059             /* Update potential sum for this i atom from the interaction with this j atom. */
1060             velec            = _mm_andnot_ps(dummy_mask,velec);
1061             velecsum         = _mm_add_ps(velecsum,velec);
1062
1063             fscal            = felec;
1064
1065             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1066
1067              /* Update vectorial force */
1068             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1069             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1070             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1071
1072             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1073             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1074             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1075
1076             /**************************
1077              * CALCULATE INTERACTIONS *
1078              **************************/
1079
1080             r22              = _mm_mul_ps(rsq22,rinv22);
1081             r22              = _mm_andnot_ps(dummy_mask,r22);
1082
1083             /* EWALD ELECTROSTATICS */
1084
1085             /* Analytical PME correction */
1086             zeta2            = _mm_mul_ps(beta2,rsq22);
1087             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1088             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1089             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1090             felec            = _mm_mul_ps(qq22,felec);
1091             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1092             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1093             velec            = _mm_mul_ps(qq22,velec);
1094
1095             /* Update potential sum for this i atom from the interaction with this j atom. */
1096             velec            = _mm_andnot_ps(dummy_mask,velec);
1097             velecsum         = _mm_add_ps(velecsum,velec);
1098
1099             fscal            = felec;
1100
1101             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1102
1103              /* Update vectorial force */
1104             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1105             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1106             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1107
1108             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1109             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1110             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1111
1112             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1113             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1114             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1115             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1116
1117             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1118                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1119
1120             /* Inner loop uses 304 flops */
1121         }
1122
1123         /* End of innermost loop */
1124
1125         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1126                                               f+i_coord_offset,fshift+i_shift_offset);
1127
1128         ggid                        = gid[iidx];
1129         /* Update potential energies */
1130         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1131         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1132
1133         /* Increment number of inner iterations */
1134         inneriter                  += j_index_end - j_index_start;
1135
1136         /* Outer loop uses 20 flops */
1137     }
1138
1139     /* Increment number of outer iterations */
1140     outeriter        += nri;
1141
1142     /* Update outer/inner flops */
1143
1144     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*304);
1145 }
1146 /*
1147  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_single
1148  * Electrostatics interaction: Ewald
1149  * VdW interaction:            CubicSplineTable
1150  * Geometry:                   Water3-Water3
1151  * Calculate force/pot:        Force
1152  */
1153 void
1154 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_avx_128_fma_single
1155                     (t_nblist                    * gmx_restrict       nlist,
1156                      rvec                        * gmx_restrict          xx,
1157                      rvec                        * gmx_restrict          ff,
1158                      t_forcerec                  * gmx_restrict          fr,
1159                      t_mdatoms                   * gmx_restrict     mdatoms,
1160                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1161                      t_nrnb                      * gmx_restrict        nrnb)
1162 {
1163     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1164      * just 0 for non-waters.
1165      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1166      * jnr indices corresponding to data put in the four positions in the SIMD register.
1167      */
1168     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1169     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1170     int              jnrA,jnrB,jnrC,jnrD;
1171     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1172     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1173     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1174     real             rcutoff_scalar;
1175     real             *shiftvec,*fshift,*x,*f;
1176     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1177     real             scratch[4*DIM];
1178     __m128           fscal,rcutoff,rcutoff2,jidxall;
1179     int              vdwioffset0;
1180     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1181     int              vdwioffset1;
1182     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1183     int              vdwioffset2;
1184     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1185     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1186     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1187     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1188     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1189     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1190     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1191     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1192     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1193     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1194     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1195     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1196     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1197     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1198     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1199     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1200     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1201     real             *charge;
1202     int              nvdwtype;
1203     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1204     int              *vdwtype;
1205     real             *vdwparam;
1206     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1207     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1208     __m128i          vfitab;
1209     __m128i          ifour       = _mm_set1_epi32(4);
1210     __m128           rt,vfeps,twovfeps,vftabscale,Y,F,G,H,Fp,VV,FF;
1211     real             *vftab;
1212     __m128i          ewitab;
1213     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1214     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1215     real             *ewtab;
1216     __m128           dummy_mask,cutoff_mask;
1217     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1218     __m128           one     = _mm_set1_ps(1.0);
1219     __m128           two     = _mm_set1_ps(2.0);
1220     x                = xx[0];
1221     f                = ff[0];
1222
1223     nri              = nlist->nri;
1224     iinr             = nlist->iinr;
1225     jindex           = nlist->jindex;
1226     jjnr             = nlist->jjnr;
1227     shiftidx         = nlist->shift;
1228     gid              = nlist->gid;
1229     shiftvec         = fr->shift_vec[0];
1230     fshift           = fr->fshift[0];
1231     facel            = _mm_set1_ps(fr->epsfac);
1232     charge           = mdatoms->chargeA;
1233     nvdwtype         = fr->ntype;
1234     vdwparam         = fr->nbfp;
1235     vdwtype          = mdatoms->typeA;
1236
1237     vftab            = kernel_data->table_vdw->data;
1238     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1239
1240     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1241     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1242     beta2            = _mm_mul_ps(beta,beta);
1243     beta3            = _mm_mul_ps(beta,beta2);
1244     ewtab            = fr->ic->tabq_coul_F;
1245     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1246     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1247
1248     /* Setup water-specific parameters */
1249     inr              = nlist->iinr[0];
1250     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1251     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1252     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1253     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1254
1255     jq0              = _mm_set1_ps(charge[inr+0]);
1256     jq1              = _mm_set1_ps(charge[inr+1]);
1257     jq2              = _mm_set1_ps(charge[inr+2]);
1258     vdwjidx0A        = 2*vdwtype[inr+0];
1259     qq00             = _mm_mul_ps(iq0,jq0);
1260     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1261     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1262     qq01             = _mm_mul_ps(iq0,jq1);
1263     qq02             = _mm_mul_ps(iq0,jq2);
1264     qq10             = _mm_mul_ps(iq1,jq0);
1265     qq11             = _mm_mul_ps(iq1,jq1);
1266     qq12             = _mm_mul_ps(iq1,jq2);
1267     qq20             = _mm_mul_ps(iq2,jq0);
1268     qq21             = _mm_mul_ps(iq2,jq1);
1269     qq22             = _mm_mul_ps(iq2,jq2);
1270
1271     /* Avoid stupid compiler warnings */
1272     jnrA = jnrB = jnrC = jnrD = 0;
1273     j_coord_offsetA = 0;
1274     j_coord_offsetB = 0;
1275     j_coord_offsetC = 0;
1276     j_coord_offsetD = 0;
1277
1278     outeriter        = 0;
1279     inneriter        = 0;
1280
1281     for(iidx=0;iidx<4*DIM;iidx++)
1282     {
1283         scratch[iidx] = 0.0;
1284     }
1285
1286     /* Start outer loop over neighborlists */
1287     for(iidx=0; iidx<nri; iidx++)
1288     {
1289         /* Load shift vector for this list */
1290         i_shift_offset   = DIM*shiftidx[iidx];
1291
1292         /* Load limits for loop over neighbors */
1293         j_index_start    = jindex[iidx];
1294         j_index_end      = jindex[iidx+1];
1295
1296         /* Get outer coordinate index */
1297         inr              = iinr[iidx];
1298         i_coord_offset   = DIM*inr;
1299
1300         /* Load i particle coords and add shift vector */
1301         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1302                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1303
1304         fix0             = _mm_setzero_ps();
1305         fiy0             = _mm_setzero_ps();
1306         fiz0             = _mm_setzero_ps();
1307         fix1             = _mm_setzero_ps();
1308         fiy1             = _mm_setzero_ps();
1309         fiz1             = _mm_setzero_ps();
1310         fix2             = _mm_setzero_ps();
1311         fiy2             = _mm_setzero_ps();
1312         fiz2             = _mm_setzero_ps();
1313
1314         /* Start inner kernel loop */
1315         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1316         {
1317
1318             /* Get j neighbor index, and coordinate index */
1319             jnrA             = jjnr[jidx];
1320             jnrB             = jjnr[jidx+1];
1321             jnrC             = jjnr[jidx+2];
1322             jnrD             = jjnr[jidx+3];
1323             j_coord_offsetA  = DIM*jnrA;
1324             j_coord_offsetB  = DIM*jnrB;
1325             j_coord_offsetC  = DIM*jnrC;
1326             j_coord_offsetD  = DIM*jnrD;
1327
1328             /* load j atom coordinates */
1329             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1330                                               x+j_coord_offsetC,x+j_coord_offsetD,
1331                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1332
1333             /* Calculate displacement vector */
1334             dx00             = _mm_sub_ps(ix0,jx0);
1335             dy00             = _mm_sub_ps(iy0,jy0);
1336             dz00             = _mm_sub_ps(iz0,jz0);
1337             dx01             = _mm_sub_ps(ix0,jx1);
1338             dy01             = _mm_sub_ps(iy0,jy1);
1339             dz01             = _mm_sub_ps(iz0,jz1);
1340             dx02             = _mm_sub_ps(ix0,jx2);
1341             dy02             = _mm_sub_ps(iy0,jy2);
1342             dz02             = _mm_sub_ps(iz0,jz2);
1343             dx10             = _mm_sub_ps(ix1,jx0);
1344             dy10             = _mm_sub_ps(iy1,jy0);
1345             dz10             = _mm_sub_ps(iz1,jz0);
1346             dx11             = _mm_sub_ps(ix1,jx1);
1347             dy11             = _mm_sub_ps(iy1,jy1);
1348             dz11             = _mm_sub_ps(iz1,jz1);
1349             dx12             = _mm_sub_ps(ix1,jx2);
1350             dy12             = _mm_sub_ps(iy1,jy2);
1351             dz12             = _mm_sub_ps(iz1,jz2);
1352             dx20             = _mm_sub_ps(ix2,jx0);
1353             dy20             = _mm_sub_ps(iy2,jy0);
1354             dz20             = _mm_sub_ps(iz2,jz0);
1355             dx21             = _mm_sub_ps(ix2,jx1);
1356             dy21             = _mm_sub_ps(iy2,jy1);
1357             dz21             = _mm_sub_ps(iz2,jz1);
1358             dx22             = _mm_sub_ps(ix2,jx2);
1359             dy22             = _mm_sub_ps(iy2,jy2);
1360             dz22             = _mm_sub_ps(iz2,jz2);
1361
1362             /* Calculate squared distance and things based on it */
1363             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1364             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1365             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1366             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1367             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1368             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1369             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1370             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1371             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1372
1373             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1374             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1375             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1376             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1377             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1378             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1379             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1380             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1381             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1382
1383             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1384             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1385             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1386             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1387             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1388             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1389             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1390             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1391             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1392
1393             fjx0             = _mm_setzero_ps();
1394             fjy0             = _mm_setzero_ps();
1395             fjz0             = _mm_setzero_ps();
1396             fjx1             = _mm_setzero_ps();
1397             fjy1             = _mm_setzero_ps();
1398             fjz1             = _mm_setzero_ps();
1399             fjx2             = _mm_setzero_ps();
1400             fjy2             = _mm_setzero_ps();
1401             fjz2             = _mm_setzero_ps();
1402
1403             /**************************
1404              * CALCULATE INTERACTIONS *
1405              **************************/
1406
1407             r00              = _mm_mul_ps(rsq00,rinv00);
1408
1409             /* Calculate table index by multiplying r with table scale and truncate to integer */
1410             rt               = _mm_mul_ps(r00,vftabscale);
1411             vfitab           = _mm_cvttps_epi32(rt);
1412 #ifdef __XOP__
1413             vfeps            = _mm_frcz_ps(rt);
1414 #else
1415             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1416 #endif
1417             twovfeps         = _mm_add_ps(vfeps,vfeps);
1418             vfitab           = _mm_slli_epi32(vfitab,3);
1419
1420             /* EWALD ELECTROSTATICS */
1421
1422             /* Analytical PME correction */
1423             zeta2            = _mm_mul_ps(beta2,rsq00);
1424             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1425             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1426             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1427             felec            = _mm_mul_ps(qq00,felec);
1428
1429             /* CUBIC SPLINE TABLE DISPERSION */
1430             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1431             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1432             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1433             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1434             _MM_TRANSPOSE4_PS(Y,F,G,H);
1435             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1436             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1437             fvdw6            = _mm_mul_ps(c6_00,FF);
1438
1439             /* CUBIC SPLINE TABLE REPULSION */
1440             vfitab           = _mm_add_epi32(vfitab,ifour);
1441             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1442             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1443             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1444             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1445             _MM_TRANSPOSE4_PS(Y,F,G,H);
1446             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1447             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1448             fvdw12           = _mm_mul_ps(c12_00,FF);
1449             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1450
1451             fscal            = _mm_add_ps(felec,fvdw);
1452
1453              /* Update vectorial force */
1454             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1455             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1456             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1457
1458             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1459             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1460             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1461
1462             /**************************
1463              * CALCULATE INTERACTIONS *
1464              **************************/
1465
1466             r01              = _mm_mul_ps(rsq01,rinv01);
1467
1468             /* EWALD ELECTROSTATICS */
1469
1470             /* Analytical PME correction */
1471             zeta2            = _mm_mul_ps(beta2,rsq01);
1472             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1473             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1474             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1475             felec            = _mm_mul_ps(qq01,felec);
1476
1477             fscal            = felec;
1478
1479              /* Update vectorial force */
1480             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1481             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1482             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1483
1484             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1485             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1486             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1487
1488             /**************************
1489              * CALCULATE INTERACTIONS *
1490              **************************/
1491
1492             r02              = _mm_mul_ps(rsq02,rinv02);
1493
1494             /* EWALD ELECTROSTATICS */
1495
1496             /* Analytical PME correction */
1497             zeta2            = _mm_mul_ps(beta2,rsq02);
1498             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1499             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1500             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1501             felec            = _mm_mul_ps(qq02,felec);
1502
1503             fscal            = felec;
1504
1505              /* Update vectorial force */
1506             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1507             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1508             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1509
1510             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1511             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1512             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1513
1514             /**************************
1515              * CALCULATE INTERACTIONS *
1516              **************************/
1517
1518             r10              = _mm_mul_ps(rsq10,rinv10);
1519
1520             /* EWALD ELECTROSTATICS */
1521
1522             /* Analytical PME correction */
1523             zeta2            = _mm_mul_ps(beta2,rsq10);
1524             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1525             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1526             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1527             felec            = _mm_mul_ps(qq10,felec);
1528
1529             fscal            = felec;
1530
1531              /* Update vectorial force */
1532             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1533             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1534             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1535
1536             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1537             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1538             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1539
1540             /**************************
1541              * CALCULATE INTERACTIONS *
1542              **************************/
1543
1544             r11              = _mm_mul_ps(rsq11,rinv11);
1545
1546             /* EWALD ELECTROSTATICS */
1547
1548             /* Analytical PME correction */
1549             zeta2            = _mm_mul_ps(beta2,rsq11);
1550             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1551             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1552             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1553             felec            = _mm_mul_ps(qq11,felec);
1554
1555             fscal            = felec;
1556
1557              /* Update vectorial force */
1558             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1559             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1560             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1561
1562             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1563             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1564             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1565
1566             /**************************
1567              * CALCULATE INTERACTIONS *
1568              **************************/
1569
1570             r12              = _mm_mul_ps(rsq12,rinv12);
1571
1572             /* EWALD ELECTROSTATICS */
1573
1574             /* Analytical PME correction */
1575             zeta2            = _mm_mul_ps(beta2,rsq12);
1576             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1577             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1578             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1579             felec            = _mm_mul_ps(qq12,felec);
1580
1581             fscal            = felec;
1582
1583              /* Update vectorial force */
1584             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1585             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1586             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1587
1588             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1589             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1590             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1591
1592             /**************************
1593              * CALCULATE INTERACTIONS *
1594              **************************/
1595
1596             r20              = _mm_mul_ps(rsq20,rinv20);
1597
1598             /* EWALD ELECTROSTATICS */
1599
1600             /* Analytical PME correction */
1601             zeta2            = _mm_mul_ps(beta2,rsq20);
1602             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1603             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1604             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1605             felec            = _mm_mul_ps(qq20,felec);
1606
1607             fscal            = felec;
1608
1609              /* Update vectorial force */
1610             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1611             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1612             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1613
1614             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1615             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1616             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1617
1618             /**************************
1619              * CALCULATE INTERACTIONS *
1620              **************************/
1621
1622             r21              = _mm_mul_ps(rsq21,rinv21);
1623
1624             /* EWALD ELECTROSTATICS */
1625
1626             /* Analytical PME correction */
1627             zeta2            = _mm_mul_ps(beta2,rsq21);
1628             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1629             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1630             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1631             felec            = _mm_mul_ps(qq21,felec);
1632
1633             fscal            = felec;
1634
1635              /* Update vectorial force */
1636             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1637             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1638             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1639
1640             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1641             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1642             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1643
1644             /**************************
1645              * CALCULATE INTERACTIONS *
1646              **************************/
1647
1648             r22              = _mm_mul_ps(rsq22,rinv22);
1649
1650             /* EWALD ELECTROSTATICS */
1651
1652             /* Analytical PME correction */
1653             zeta2            = _mm_mul_ps(beta2,rsq22);
1654             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1655             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1656             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1657             felec            = _mm_mul_ps(qq22,felec);
1658
1659             fscal            = felec;
1660
1661              /* Update vectorial force */
1662             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1663             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1664             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1665
1666             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1667             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1668             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1669
1670             fjptrA             = f+j_coord_offsetA;
1671             fjptrB             = f+j_coord_offsetB;
1672             fjptrC             = f+j_coord_offsetC;
1673             fjptrD             = f+j_coord_offsetD;
1674
1675             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1676                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1677
1678             /* Inner loop uses 278 flops */
1679         }
1680
1681         if(jidx<j_index_end)
1682         {
1683
1684             /* Get j neighbor index, and coordinate index */
1685             jnrlistA         = jjnr[jidx];
1686             jnrlistB         = jjnr[jidx+1];
1687             jnrlistC         = jjnr[jidx+2];
1688             jnrlistD         = jjnr[jidx+3];
1689             /* Sign of each element will be negative for non-real atoms.
1690              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1691              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1692              */
1693             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1694             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1695             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1696             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1697             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1698             j_coord_offsetA  = DIM*jnrA;
1699             j_coord_offsetB  = DIM*jnrB;
1700             j_coord_offsetC  = DIM*jnrC;
1701             j_coord_offsetD  = DIM*jnrD;
1702
1703             /* load j atom coordinates */
1704             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1705                                               x+j_coord_offsetC,x+j_coord_offsetD,
1706                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1707
1708             /* Calculate displacement vector */
1709             dx00             = _mm_sub_ps(ix0,jx0);
1710             dy00             = _mm_sub_ps(iy0,jy0);
1711             dz00             = _mm_sub_ps(iz0,jz0);
1712             dx01             = _mm_sub_ps(ix0,jx1);
1713             dy01             = _mm_sub_ps(iy0,jy1);
1714             dz01             = _mm_sub_ps(iz0,jz1);
1715             dx02             = _mm_sub_ps(ix0,jx2);
1716             dy02             = _mm_sub_ps(iy0,jy2);
1717             dz02             = _mm_sub_ps(iz0,jz2);
1718             dx10             = _mm_sub_ps(ix1,jx0);
1719             dy10             = _mm_sub_ps(iy1,jy0);
1720             dz10             = _mm_sub_ps(iz1,jz0);
1721             dx11             = _mm_sub_ps(ix1,jx1);
1722             dy11             = _mm_sub_ps(iy1,jy1);
1723             dz11             = _mm_sub_ps(iz1,jz1);
1724             dx12             = _mm_sub_ps(ix1,jx2);
1725             dy12             = _mm_sub_ps(iy1,jy2);
1726             dz12             = _mm_sub_ps(iz1,jz2);
1727             dx20             = _mm_sub_ps(ix2,jx0);
1728             dy20             = _mm_sub_ps(iy2,jy0);
1729             dz20             = _mm_sub_ps(iz2,jz0);
1730             dx21             = _mm_sub_ps(ix2,jx1);
1731             dy21             = _mm_sub_ps(iy2,jy1);
1732             dz21             = _mm_sub_ps(iz2,jz1);
1733             dx22             = _mm_sub_ps(ix2,jx2);
1734             dy22             = _mm_sub_ps(iy2,jy2);
1735             dz22             = _mm_sub_ps(iz2,jz2);
1736
1737             /* Calculate squared distance and things based on it */
1738             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1739             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1740             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1741             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1742             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1743             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1744             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1745             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1746             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1747
1748             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1749             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1750             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1751             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1752             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1753             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1754             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1755             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1756             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1757
1758             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1759             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1760             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1761             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1762             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1763             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1764             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1765             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1766             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1767
1768             fjx0             = _mm_setzero_ps();
1769             fjy0             = _mm_setzero_ps();
1770             fjz0             = _mm_setzero_ps();
1771             fjx1             = _mm_setzero_ps();
1772             fjy1             = _mm_setzero_ps();
1773             fjz1             = _mm_setzero_ps();
1774             fjx2             = _mm_setzero_ps();
1775             fjy2             = _mm_setzero_ps();
1776             fjz2             = _mm_setzero_ps();
1777
1778             /**************************
1779              * CALCULATE INTERACTIONS *
1780              **************************/
1781
1782             r00              = _mm_mul_ps(rsq00,rinv00);
1783             r00              = _mm_andnot_ps(dummy_mask,r00);
1784
1785             /* Calculate table index by multiplying r with table scale and truncate to integer */
1786             rt               = _mm_mul_ps(r00,vftabscale);
1787             vfitab           = _mm_cvttps_epi32(rt);
1788 #ifdef __XOP__
1789             vfeps            = _mm_frcz_ps(rt);
1790 #else
1791             vfeps            = _mm_sub_ps(rt,_mm_round_ps(rt, _MM_FROUND_FLOOR));
1792 #endif
1793             twovfeps         = _mm_add_ps(vfeps,vfeps);
1794             vfitab           = _mm_slli_epi32(vfitab,3);
1795
1796             /* EWALD ELECTROSTATICS */
1797
1798             /* Analytical PME correction */
1799             zeta2            = _mm_mul_ps(beta2,rsq00);
1800             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1801             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1802             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1803             felec            = _mm_mul_ps(qq00,felec);
1804
1805             /* CUBIC SPLINE TABLE DISPERSION */
1806             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1807             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1808             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1809             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1810             _MM_TRANSPOSE4_PS(Y,F,G,H);
1811             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1812             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1813             fvdw6            = _mm_mul_ps(c6_00,FF);
1814
1815             /* CUBIC SPLINE TABLE REPULSION */
1816             vfitab           = _mm_add_epi32(vfitab,ifour);
1817             Y                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,0) );
1818             F                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,1) );
1819             G                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,2) );
1820             H                = _mm_load_ps( vftab + _mm_extract_epi32(vfitab,3) );
1821             _MM_TRANSPOSE4_PS(Y,F,G,H);
1822             Fp               = _mm_macc_ps(vfeps,_mm_macc_ps(H,vfeps,G),F);
1823             FF               = _mm_macc_ps(vfeps,_mm_macc_ps(twovfeps,H,G),Fp);
1824             fvdw12           = _mm_mul_ps(c12_00,FF);
1825             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1826
1827             fscal            = _mm_add_ps(felec,fvdw);
1828
1829             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1830
1831              /* Update vectorial force */
1832             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1833             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1834             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1835
1836             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1837             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1838             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1839
1840             /**************************
1841              * CALCULATE INTERACTIONS *
1842              **************************/
1843
1844             r01              = _mm_mul_ps(rsq01,rinv01);
1845             r01              = _mm_andnot_ps(dummy_mask,r01);
1846
1847             /* EWALD ELECTROSTATICS */
1848
1849             /* Analytical PME correction */
1850             zeta2            = _mm_mul_ps(beta2,rsq01);
1851             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1852             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1853             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1854             felec            = _mm_mul_ps(qq01,felec);
1855
1856             fscal            = felec;
1857
1858             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1859
1860              /* Update vectorial force */
1861             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1862             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1863             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1864
1865             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1866             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1867             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1868
1869             /**************************
1870              * CALCULATE INTERACTIONS *
1871              **************************/
1872
1873             r02              = _mm_mul_ps(rsq02,rinv02);
1874             r02              = _mm_andnot_ps(dummy_mask,r02);
1875
1876             /* EWALD ELECTROSTATICS */
1877
1878             /* Analytical PME correction */
1879             zeta2            = _mm_mul_ps(beta2,rsq02);
1880             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1881             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1882             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1883             felec            = _mm_mul_ps(qq02,felec);
1884
1885             fscal            = felec;
1886
1887             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1888
1889              /* Update vectorial force */
1890             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1891             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1892             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1893
1894             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1895             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1896             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1897
1898             /**************************
1899              * CALCULATE INTERACTIONS *
1900              **************************/
1901
1902             r10              = _mm_mul_ps(rsq10,rinv10);
1903             r10              = _mm_andnot_ps(dummy_mask,r10);
1904
1905             /* EWALD ELECTROSTATICS */
1906
1907             /* Analytical PME correction */
1908             zeta2            = _mm_mul_ps(beta2,rsq10);
1909             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1910             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1911             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1912             felec            = _mm_mul_ps(qq10,felec);
1913
1914             fscal            = felec;
1915
1916             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1917
1918              /* Update vectorial force */
1919             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1920             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1921             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1922
1923             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1924             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1925             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1926
1927             /**************************
1928              * CALCULATE INTERACTIONS *
1929              **************************/
1930
1931             r11              = _mm_mul_ps(rsq11,rinv11);
1932             r11              = _mm_andnot_ps(dummy_mask,r11);
1933
1934             /* EWALD ELECTROSTATICS */
1935
1936             /* Analytical PME correction */
1937             zeta2            = _mm_mul_ps(beta2,rsq11);
1938             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1939             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1940             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1941             felec            = _mm_mul_ps(qq11,felec);
1942
1943             fscal            = felec;
1944
1945             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1946
1947              /* Update vectorial force */
1948             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1949             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1950             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1951
1952             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1953             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1954             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1955
1956             /**************************
1957              * CALCULATE INTERACTIONS *
1958              **************************/
1959
1960             r12              = _mm_mul_ps(rsq12,rinv12);
1961             r12              = _mm_andnot_ps(dummy_mask,r12);
1962
1963             /* EWALD ELECTROSTATICS */
1964
1965             /* Analytical PME correction */
1966             zeta2            = _mm_mul_ps(beta2,rsq12);
1967             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1968             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1969             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1970             felec            = _mm_mul_ps(qq12,felec);
1971
1972             fscal            = felec;
1973
1974             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1975
1976              /* Update vectorial force */
1977             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1978             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1979             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1980
1981             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1982             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1983             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1984
1985             /**************************
1986              * CALCULATE INTERACTIONS *
1987              **************************/
1988
1989             r20              = _mm_mul_ps(rsq20,rinv20);
1990             r20              = _mm_andnot_ps(dummy_mask,r20);
1991
1992             /* EWALD ELECTROSTATICS */
1993
1994             /* Analytical PME correction */
1995             zeta2            = _mm_mul_ps(beta2,rsq20);
1996             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1997             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1998             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1999             felec            = _mm_mul_ps(qq20,felec);
2000
2001             fscal            = felec;
2002
2003             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2004
2005              /* Update vectorial force */
2006             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2007             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2008             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2009
2010             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2011             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2012             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2013
2014             /**************************
2015              * CALCULATE INTERACTIONS *
2016              **************************/
2017
2018             r21              = _mm_mul_ps(rsq21,rinv21);
2019             r21              = _mm_andnot_ps(dummy_mask,r21);
2020
2021             /* EWALD ELECTROSTATICS */
2022
2023             /* Analytical PME correction */
2024             zeta2            = _mm_mul_ps(beta2,rsq21);
2025             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2026             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2027             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2028             felec            = _mm_mul_ps(qq21,felec);
2029
2030             fscal            = felec;
2031
2032             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2033
2034              /* Update vectorial force */
2035             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2036             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2037             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2038
2039             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2040             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2041             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2042
2043             /**************************
2044              * CALCULATE INTERACTIONS *
2045              **************************/
2046
2047             r22              = _mm_mul_ps(rsq22,rinv22);
2048             r22              = _mm_andnot_ps(dummy_mask,r22);
2049
2050             /* EWALD ELECTROSTATICS */
2051
2052             /* Analytical PME correction */
2053             zeta2            = _mm_mul_ps(beta2,rsq22);
2054             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2055             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2056             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2057             felec            = _mm_mul_ps(qq22,felec);
2058
2059             fscal            = felec;
2060
2061             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2062
2063              /* Update vectorial force */
2064             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2065             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2066             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2067
2068             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2069             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2070             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2071
2072             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2073             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2074             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2075             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2076
2077             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2078                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2079
2080             /* Inner loop uses 287 flops */
2081         }
2082
2083         /* End of innermost loop */
2084
2085         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2086                                               f+i_coord_offset,fshift+i_shift_offset);
2087
2088         /* Increment number of inner iterations */
2089         inneriter                  += j_index_end - j_index_start;
2090
2091         /* Outer loop uses 18 flops */
2092     }
2093
2094     /* Increment number of outer iterations */
2095     outeriter        += nri;
2096
2097     /* Update outer/inner flops */
2098
2099     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*287);
2100 }