d81061d117747f27c5a1c99d48c7edc095c75181
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwLJEw_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LJEwald
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_VF_avx_128_fma_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwioffset3;
90     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
116     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
117     __m128           c6grid_00;
118     __m128           c6grid_11;
119     __m128           c6grid_12;
120     __m128           c6grid_13;
121     __m128           c6grid_21;
122     __m128           c6grid_22;
123     __m128           c6grid_23;
124     __m128           c6grid_31;
125     __m128           c6grid_32;
126     __m128           c6grid_33;
127     real             *vdwgridparam;
128     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
129     __m128           one_half = _mm_set1_ps(0.5);
130     __m128           minus_one = _mm_set1_ps(-1.0);
131     __m128i          ewitab;
132     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
133     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
134     real             *ewtab;
135     __m128           dummy_mask,cutoff_mask;
136     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
137     __m128           one     = _mm_set1_ps(1.0);
138     __m128           two     = _mm_set1_ps(2.0);
139     x                = xx[0];
140     f                = ff[0];
141
142     nri              = nlist->nri;
143     iinr             = nlist->iinr;
144     jindex           = nlist->jindex;
145     jjnr             = nlist->jjnr;
146     shiftidx         = nlist->shift;
147     gid              = nlist->gid;
148     shiftvec         = fr->shift_vec[0];
149     fshift           = fr->fshift[0];
150     facel            = _mm_set1_ps(fr->epsfac);
151     charge           = mdatoms->chargeA;
152     nvdwtype         = fr->ntype;
153     vdwparam         = fr->nbfp;
154     vdwtype          = mdatoms->typeA;
155     vdwgridparam     = fr->ljpme_c6grid;
156     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
157     ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
158     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
159
160     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
161     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
162     beta2            = _mm_mul_ps(beta,beta);
163     beta3            = _mm_mul_ps(beta,beta2);
164     ewtab            = fr->ic->tabq_coul_FDV0;
165     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
166     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
167
168     /* Setup water-specific parameters */
169     inr              = nlist->iinr[0];
170     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
171     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
172     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
173     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
174
175     jq1              = _mm_set1_ps(charge[inr+1]);
176     jq2              = _mm_set1_ps(charge[inr+2]);
177     jq3              = _mm_set1_ps(charge[inr+3]);
178     vdwjidx0A        = 2*vdwtype[inr+0];
179     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
180     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
181     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
182     qq11             = _mm_mul_ps(iq1,jq1);
183     qq12             = _mm_mul_ps(iq1,jq2);
184     qq13             = _mm_mul_ps(iq1,jq3);
185     qq21             = _mm_mul_ps(iq2,jq1);
186     qq22             = _mm_mul_ps(iq2,jq2);
187     qq23             = _mm_mul_ps(iq2,jq3);
188     qq31             = _mm_mul_ps(iq3,jq1);
189     qq32             = _mm_mul_ps(iq3,jq2);
190     qq33             = _mm_mul_ps(iq3,jq3);
191
192     /* Avoid stupid compiler warnings */
193     jnrA = jnrB = jnrC = jnrD = 0;
194     j_coord_offsetA = 0;
195     j_coord_offsetB = 0;
196     j_coord_offsetC = 0;
197     j_coord_offsetD = 0;
198
199     outeriter        = 0;
200     inneriter        = 0;
201
202     for(iidx=0;iidx<4*DIM;iidx++)
203     {
204         scratch[iidx] = 0.0;
205     }
206
207     /* Start outer loop over neighborlists */
208     for(iidx=0; iidx<nri; iidx++)
209     {
210         /* Load shift vector for this list */
211         i_shift_offset   = DIM*shiftidx[iidx];
212
213         /* Load limits for loop over neighbors */
214         j_index_start    = jindex[iidx];
215         j_index_end      = jindex[iidx+1];
216
217         /* Get outer coordinate index */
218         inr              = iinr[iidx];
219         i_coord_offset   = DIM*inr;
220
221         /* Load i particle coords and add shift vector */
222         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
223                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
224
225         fix0             = _mm_setzero_ps();
226         fiy0             = _mm_setzero_ps();
227         fiz0             = _mm_setzero_ps();
228         fix1             = _mm_setzero_ps();
229         fiy1             = _mm_setzero_ps();
230         fiz1             = _mm_setzero_ps();
231         fix2             = _mm_setzero_ps();
232         fiy2             = _mm_setzero_ps();
233         fiz2             = _mm_setzero_ps();
234         fix3             = _mm_setzero_ps();
235         fiy3             = _mm_setzero_ps();
236         fiz3             = _mm_setzero_ps();
237
238         /* Reset potential sums */
239         velecsum         = _mm_setzero_ps();
240         vvdwsum          = _mm_setzero_ps();
241
242         /* Start inner kernel loop */
243         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
244         {
245
246             /* Get j neighbor index, and coordinate index */
247             jnrA             = jjnr[jidx];
248             jnrB             = jjnr[jidx+1];
249             jnrC             = jjnr[jidx+2];
250             jnrD             = jjnr[jidx+3];
251             j_coord_offsetA  = DIM*jnrA;
252             j_coord_offsetB  = DIM*jnrB;
253             j_coord_offsetC  = DIM*jnrC;
254             j_coord_offsetD  = DIM*jnrD;
255
256             /* load j atom coordinates */
257             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
258                                               x+j_coord_offsetC,x+j_coord_offsetD,
259                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
260                                               &jy2,&jz2,&jx3,&jy3,&jz3);
261
262             /* Calculate displacement vector */
263             dx00             = _mm_sub_ps(ix0,jx0);
264             dy00             = _mm_sub_ps(iy0,jy0);
265             dz00             = _mm_sub_ps(iz0,jz0);
266             dx11             = _mm_sub_ps(ix1,jx1);
267             dy11             = _mm_sub_ps(iy1,jy1);
268             dz11             = _mm_sub_ps(iz1,jz1);
269             dx12             = _mm_sub_ps(ix1,jx2);
270             dy12             = _mm_sub_ps(iy1,jy2);
271             dz12             = _mm_sub_ps(iz1,jz2);
272             dx13             = _mm_sub_ps(ix1,jx3);
273             dy13             = _mm_sub_ps(iy1,jy3);
274             dz13             = _mm_sub_ps(iz1,jz3);
275             dx21             = _mm_sub_ps(ix2,jx1);
276             dy21             = _mm_sub_ps(iy2,jy1);
277             dz21             = _mm_sub_ps(iz2,jz1);
278             dx22             = _mm_sub_ps(ix2,jx2);
279             dy22             = _mm_sub_ps(iy2,jy2);
280             dz22             = _mm_sub_ps(iz2,jz2);
281             dx23             = _mm_sub_ps(ix2,jx3);
282             dy23             = _mm_sub_ps(iy2,jy3);
283             dz23             = _mm_sub_ps(iz2,jz3);
284             dx31             = _mm_sub_ps(ix3,jx1);
285             dy31             = _mm_sub_ps(iy3,jy1);
286             dz31             = _mm_sub_ps(iz3,jz1);
287             dx32             = _mm_sub_ps(ix3,jx2);
288             dy32             = _mm_sub_ps(iy3,jy2);
289             dz32             = _mm_sub_ps(iz3,jz2);
290             dx33             = _mm_sub_ps(ix3,jx3);
291             dy33             = _mm_sub_ps(iy3,jy3);
292             dz33             = _mm_sub_ps(iz3,jz3);
293
294             /* Calculate squared distance and things based on it */
295             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
296             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
297             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
298             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
299             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
300             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
301             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
302             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
303             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
304             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
305
306             rinv00           = gmx_mm_invsqrt_ps(rsq00);
307             rinv11           = gmx_mm_invsqrt_ps(rsq11);
308             rinv12           = gmx_mm_invsqrt_ps(rsq12);
309             rinv13           = gmx_mm_invsqrt_ps(rsq13);
310             rinv21           = gmx_mm_invsqrt_ps(rsq21);
311             rinv22           = gmx_mm_invsqrt_ps(rsq22);
312             rinv23           = gmx_mm_invsqrt_ps(rsq23);
313             rinv31           = gmx_mm_invsqrt_ps(rsq31);
314             rinv32           = gmx_mm_invsqrt_ps(rsq32);
315             rinv33           = gmx_mm_invsqrt_ps(rsq33);
316
317             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
318             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
319             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
320             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
321             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
322             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
323             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
324             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
325             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
326             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
327
328             fjx0             = _mm_setzero_ps();
329             fjy0             = _mm_setzero_ps();
330             fjz0             = _mm_setzero_ps();
331             fjx1             = _mm_setzero_ps();
332             fjy1             = _mm_setzero_ps();
333             fjz1             = _mm_setzero_ps();
334             fjx2             = _mm_setzero_ps();
335             fjy2             = _mm_setzero_ps();
336             fjz2             = _mm_setzero_ps();
337             fjx3             = _mm_setzero_ps();
338             fjy3             = _mm_setzero_ps();
339             fjz3             = _mm_setzero_ps();
340
341             /**************************
342              * CALCULATE INTERACTIONS *
343              **************************/
344
345             r00              = _mm_mul_ps(rsq00,rinv00);
346
347             /* Analytical LJ-PME */
348             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
349             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
350             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
351             exponent         = gmx_simd_exp_r(ewcljrsq);
352             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
353             poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
354             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
355             vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
356             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
357             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
358             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
359             fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
360
361             /* Update potential sum for this i atom from the interaction with this j atom. */
362             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
363
364             fscal            = fvdw;
365
366              /* Update vectorial force */
367             fix0             = _mm_macc_ps(dx00,fscal,fix0);
368             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
369             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
370
371             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
372             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
373             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
374
375             /**************************
376              * CALCULATE INTERACTIONS *
377              **************************/
378
379             r11              = _mm_mul_ps(rsq11,rinv11);
380
381             /* EWALD ELECTROSTATICS */
382
383             /* Analytical PME correction */
384             zeta2            = _mm_mul_ps(beta2,rsq11);
385             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
386             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
387             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
388             felec            = _mm_mul_ps(qq11,felec);
389             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
390             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
391             velec            = _mm_mul_ps(qq11,velec);
392
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velecsum         = _mm_add_ps(velecsum,velec);
395
396             fscal            = felec;
397
398              /* Update vectorial force */
399             fix1             = _mm_macc_ps(dx11,fscal,fix1);
400             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
401             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
402
403             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
404             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
405             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
406
407             /**************************
408              * CALCULATE INTERACTIONS *
409              **************************/
410
411             r12              = _mm_mul_ps(rsq12,rinv12);
412
413             /* EWALD ELECTROSTATICS */
414
415             /* Analytical PME correction */
416             zeta2            = _mm_mul_ps(beta2,rsq12);
417             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
418             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
419             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
420             felec            = _mm_mul_ps(qq12,felec);
421             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
422             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
423             velec            = _mm_mul_ps(qq12,velec);
424
425             /* Update potential sum for this i atom from the interaction with this j atom. */
426             velecsum         = _mm_add_ps(velecsum,velec);
427
428             fscal            = felec;
429
430              /* Update vectorial force */
431             fix1             = _mm_macc_ps(dx12,fscal,fix1);
432             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
433             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
434
435             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
436             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
437             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
438
439             /**************************
440              * CALCULATE INTERACTIONS *
441              **************************/
442
443             r13              = _mm_mul_ps(rsq13,rinv13);
444
445             /* EWALD ELECTROSTATICS */
446
447             /* Analytical PME correction */
448             zeta2            = _mm_mul_ps(beta2,rsq13);
449             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
450             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
451             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
452             felec            = _mm_mul_ps(qq13,felec);
453             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
454             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
455             velec            = _mm_mul_ps(qq13,velec);
456
457             /* Update potential sum for this i atom from the interaction with this j atom. */
458             velecsum         = _mm_add_ps(velecsum,velec);
459
460             fscal            = felec;
461
462              /* Update vectorial force */
463             fix1             = _mm_macc_ps(dx13,fscal,fix1);
464             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
465             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
466
467             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
468             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
469             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
470
471             /**************************
472              * CALCULATE INTERACTIONS *
473              **************************/
474
475             r21              = _mm_mul_ps(rsq21,rinv21);
476
477             /* EWALD ELECTROSTATICS */
478
479             /* Analytical PME correction */
480             zeta2            = _mm_mul_ps(beta2,rsq21);
481             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
482             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
483             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
484             felec            = _mm_mul_ps(qq21,felec);
485             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
486             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
487             velec            = _mm_mul_ps(qq21,velec);
488
489             /* Update potential sum for this i atom from the interaction with this j atom. */
490             velecsum         = _mm_add_ps(velecsum,velec);
491
492             fscal            = felec;
493
494              /* Update vectorial force */
495             fix2             = _mm_macc_ps(dx21,fscal,fix2);
496             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
497             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
498
499             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
500             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
501             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
502
503             /**************************
504              * CALCULATE INTERACTIONS *
505              **************************/
506
507             r22              = _mm_mul_ps(rsq22,rinv22);
508
509             /* EWALD ELECTROSTATICS */
510
511             /* Analytical PME correction */
512             zeta2            = _mm_mul_ps(beta2,rsq22);
513             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
514             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
515             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
516             felec            = _mm_mul_ps(qq22,felec);
517             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
518             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
519             velec            = _mm_mul_ps(qq22,velec);
520
521             /* Update potential sum for this i atom from the interaction with this j atom. */
522             velecsum         = _mm_add_ps(velecsum,velec);
523
524             fscal            = felec;
525
526              /* Update vectorial force */
527             fix2             = _mm_macc_ps(dx22,fscal,fix2);
528             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
529             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
530
531             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
532             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
533             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
534
535             /**************************
536              * CALCULATE INTERACTIONS *
537              **************************/
538
539             r23              = _mm_mul_ps(rsq23,rinv23);
540
541             /* EWALD ELECTROSTATICS */
542
543             /* Analytical PME correction */
544             zeta2            = _mm_mul_ps(beta2,rsq23);
545             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
546             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
547             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
548             felec            = _mm_mul_ps(qq23,felec);
549             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
550             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
551             velec            = _mm_mul_ps(qq23,velec);
552
553             /* Update potential sum for this i atom from the interaction with this j atom. */
554             velecsum         = _mm_add_ps(velecsum,velec);
555
556             fscal            = felec;
557
558              /* Update vectorial force */
559             fix2             = _mm_macc_ps(dx23,fscal,fix2);
560             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
561             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
562
563             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
564             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
565             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
566
567             /**************************
568              * CALCULATE INTERACTIONS *
569              **************************/
570
571             r31              = _mm_mul_ps(rsq31,rinv31);
572
573             /* EWALD ELECTROSTATICS */
574
575             /* Analytical PME correction */
576             zeta2            = _mm_mul_ps(beta2,rsq31);
577             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
578             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
579             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
580             felec            = _mm_mul_ps(qq31,felec);
581             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
582             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
583             velec            = _mm_mul_ps(qq31,velec);
584
585             /* Update potential sum for this i atom from the interaction with this j atom. */
586             velecsum         = _mm_add_ps(velecsum,velec);
587
588             fscal            = felec;
589
590              /* Update vectorial force */
591             fix3             = _mm_macc_ps(dx31,fscal,fix3);
592             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
593             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
594
595             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
596             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
597             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
598
599             /**************************
600              * CALCULATE INTERACTIONS *
601              **************************/
602
603             r32              = _mm_mul_ps(rsq32,rinv32);
604
605             /* EWALD ELECTROSTATICS */
606
607             /* Analytical PME correction */
608             zeta2            = _mm_mul_ps(beta2,rsq32);
609             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
610             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
611             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
612             felec            = _mm_mul_ps(qq32,felec);
613             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
614             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
615             velec            = _mm_mul_ps(qq32,velec);
616
617             /* Update potential sum for this i atom from the interaction with this j atom. */
618             velecsum         = _mm_add_ps(velecsum,velec);
619
620             fscal            = felec;
621
622              /* Update vectorial force */
623             fix3             = _mm_macc_ps(dx32,fscal,fix3);
624             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
625             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
626
627             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
628             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
629             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
630
631             /**************************
632              * CALCULATE INTERACTIONS *
633              **************************/
634
635             r33              = _mm_mul_ps(rsq33,rinv33);
636
637             /* EWALD ELECTROSTATICS */
638
639             /* Analytical PME correction */
640             zeta2            = _mm_mul_ps(beta2,rsq33);
641             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
642             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
643             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
644             felec            = _mm_mul_ps(qq33,felec);
645             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
646             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
647             velec            = _mm_mul_ps(qq33,velec);
648
649             /* Update potential sum for this i atom from the interaction with this j atom. */
650             velecsum         = _mm_add_ps(velecsum,velec);
651
652             fscal            = felec;
653
654              /* Update vectorial force */
655             fix3             = _mm_macc_ps(dx33,fscal,fix3);
656             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
657             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
658
659             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
660             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
661             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
662
663             fjptrA             = f+j_coord_offsetA;
664             fjptrB             = f+j_coord_offsetB;
665             fjptrC             = f+j_coord_offsetC;
666             fjptrD             = f+j_coord_offsetD;
667
668             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
669                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
670                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
671
672             /* Inner loop uses 314 flops */
673         }
674
675         if(jidx<j_index_end)
676         {
677
678             /* Get j neighbor index, and coordinate index */
679             jnrlistA         = jjnr[jidx];
680             jnrlistB         = jjnr[jidx+1];
681             jnrlistC         = jjnr[jidx+2];
682             jnrlistD         = jjnr[jidx+3];
683             /* Sign of each element will be negative for non-real atoms.
684              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
685              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
686              */
687             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
688             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
689             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
690             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
691             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
692             j_coord_offsetA  = DIM*jnrA;
693             j_coord_offsetB  = DIM*jnrB;
694             j_coord_offsetC  = DIM*jnrC;
695             j_coord_offsetD  = DIM*jnrD;
696
697             /* load j atom coordinates */
698             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
699                                               x+j_coord_offsetC,x+j_coord_offsetD,
700                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
701                                               &jy2,&jz2,&jx3,&jy3,&jz3);
702
703             /* Calculate displacement vector */
704             dx00             = _mm_sub_ps(ix0,jx0);
705             dy00             = _mm_sub_ps(iy0,jy0);
706             dz00             = _mm_sub_ps(iz0,jz0);
707             dx11             = _mm_sub_ps(ix1,jx1);
708             dy11             = _mm_sub_ps(iy1,jy1);
709             dz11             = _mm_sub_ps(iz1,jz1);
710             dx12             = _mm_sub_ps(ix1,jx2);
711             dy12             = _mm_sub_ps(iy1,jy2);
712             dz12             = _mm_sub_ps(iz1,jz2);
713             dx13             = _mm_sub_ps(ix1,jx3);
714             dy13             = _mm_sub_ps(iy1,jy3);
715             dz13             = _mm_sub_ps(iz1,jz3);
716             dx21             = _mm_sub_ps(ix2,jx1);
717             dy21             = _mm_sub_ps(iy2,jy1);
718             dz21             = _mm_sub_ps(iz2,jz1);
719             dx22             = _mm_sub_ps(ix2,jx2);
720             dy22             = _mm_sub_ps(iy2,jy2);
721             dz22             = _mm_sub_ps(iz2,jz2);
722             dx23             = _mm_sub_ps(ix2,jx3);
723             dy23             = _mm_sub_ps(iy2,jy3);
724             dz23             = _mm_sub_ps(iz2,jz3);
725             dx31             = _mm_sub_ps(ix3,jx1);
726             dy31             = _mm_sub_ps(iy3,jy1);
727             dz31             = _mm_sub_ps(iz3,jz1);
728             dx32             = _mm_sub_ps(ix3,jx2);
729             dy32             = _mm_sub_ps(iy3,jy2);
730             dz32             = _mm_sub_ps(iz3,jz2);
731             dx33             = _mm_sub_ps(ix3,jx3);
732             dy33             = _mm_sub_ps(iy3,jy3);
733             dz33             = _mm_sub_ps(iz3,jz3);
734
735             /* Calculate squared distance and things based on it */
736             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
737             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
738             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
739             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
740             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
741             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
742             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
743             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
744             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
745             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
746
747             rinv00           = gmx_mm_invsqrt_ps(rsq00);
748             rinv11           = gmx_mm_invsqrt_ps(rsq11);
749             rinv12           = gmx_mm_invsqrt_ps(rsq12);
750             rinv13           = gmx_mm_invsqrt_ps(rsq13);
751             rinv21           = gmx_mm_invsqrt_ps(rsq21);
752             rinv22           = gmx_mm_invsqrt_ps(rsq22);
753             rinv23           = gmx_mm_invsqrt_ps(rsq23);
754             rinv31           = gmx_mm_invsqrt_ps(rsq31);
755             rinv32           = gmx_mm_invsqrt_ps(rsq32);
756             rinv33           = gmx_mm_invsqrt_ps(rsq33);
757
758             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
759             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
760             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
761             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
762             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
763             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
764             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
765             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
766             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
767             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
768
769             fjx0             = _mm_setzero_ps();
770             fjy0             = _mm_setzero_ps();
771             fjz0             = _mm_setzero_ps();
772             fjx1             = _mm_setzero_ps();
773             fjy1             = _mm_setzero_ps();
774             fjz1             = _mm_setzero_ps();
775             fjx2             = _mm_setzero_ps();
776             fjy2             = _mm_setzero_ps();
777             fjz2             = _mm_setzero_ps();
778             fjx3             = _mm_setzero_ps();
779             fjy3             = _mm_setzero_ps();
780             fjz3             = _mm_setzero_ps();
781
782             /**************************
783              * CALCULATE INTERACTIONS *
784              **************************/
785
786             r00              = _mm_mul_ps(rsq00,rinv00);
787             r00              = _mm_andnot_ps(dummy_mask,r00);
788
789             /* Analytical LJ-PME */
790             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
791             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
792             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
793             exponent         = gmx_simd_exp_r(ewcljrsq);
794             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
795             poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
796             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
797             vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
798             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
799             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
800             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
801             fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
802
803             /* Update potential sum for this i atom from the interaction with this j atom. */
804             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
805             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
806
807             fscal            = fvdw;
808
809             fscal            = _mm_andnot_ps(dummy_mask,fscal);
810
811              /* Update vectorial force */
812             fix0             = _mm_macc_ps(dx00,fscal,fix0);
813             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
814             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
815
816             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
817             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
818             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
819
820             /**************************
821              * CALCULATE INTERACTIONS *
822              **************************/
823
824             r11              = _mm_mul_ps(rsq11,rinv11);
825             r11              = _mm_andnot_ps(dummy_mask,r11);
826
827             /* EWALD ELECTROSTATICS */
828
829             /* Analytical PME correction */
830             zeta2            = _mm_mul_ps(beta2,rsq11);
831             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
832             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
833             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
834             felec            = _mm_mul_ps(qq11,felec);
835             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
836             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
837             velec            = _mm_mul_ps(qq11,velec);
838
839             /* Update potential sum for this i atom from the interaction with this j atom. */
840             velec            = _mm_andnot_ps(dummy_mask,velec);
841             velecsum         = _mm_add_ps(velecsum,velec);
842
843             fscal            = felec;
844
845             fscal            = _mm_andnot_ps(dummy_mask,fscal);
846
847              /* Update vectorial force */
848             fix1             = _mm_macc_ps(dx11,fscal,fix1);
849             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
850             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
851
852             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
853             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
854             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
855
856             /**************************
857              * CALCULATE INTERACTIONS *
858              **************************/
859
860             r12              = _mm_mul_ps(rsq12,rinv12);
861             r12              = _mm_andnot_ps(dummy_mask,r12);
862
863             /* EWALD ELECTROSTATICS */
864
865             /* Analytical PME correction */
866             zeta2            = _mm_mul_ps(beta2,rsq12);
867             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
868             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
869             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
870             felec            = _mm_mul_ps(qq12,felec);
871             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
872             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
873             velec            = _mm_mul_ps(qq12,velec);
874
875             /* Update potential sum for this i atom from the interaction with this j atom. */
876             velec            = _mm_andnot_ps(dummy_mask,velec);
877             velecsum         = _mm_add_ps(velecsum,velec);
878
879             fscal            = felec;
880
881             fscal            = _mm_andnot_ps(dummy_mask,fscal);
882
883              /* Update vectorial force */
884             fix1             = _mm_macc_ps(dx12,fscal,fix1);
885             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
886             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
887
888             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
889             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
890             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
891
892             /**************************
893              * CALCULATE INTERACTIONS *
894              **************************/
895
896             r13              = _mm_mul_ps(rsq13,rinv13);
897             r13              = _mm_andnot_ps(dummy_mask,r13);
898
899             /* EWALD ELECTROSTATICS */
900
901             /* Analytical PME correction */
902             zeta2            = _mm_mul_ps(beta2,rsq13);
903             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
904             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
905             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
906             felec            = _mm_mul_ps(qq13,felec);
907             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
908             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
909             velec            = _mm_mul_ps(qq13,velec);
910
911             /* Update potential sum for this i atom from the interaction with this j atom. */
912             velec            = _mm_andnot_ps(dummy_mask,velec);
913             velecsum         = _mm_add_ps(velecsum,velec);
914
915             fscal            = felec;
916
917             fscal            = _mm_andnot_ps(dummy_mask,fscal);
918
919              /* Update vectorial force */
920             fix1             = _mm_macc_ps(dx13,fscal,fix1);
921             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
922             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
923
924             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
925             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
926             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
927
928             /**************************
929              * CALCULATE INTERACTIONS *
930              **************************/
931
932             r21              = _mm_mul_ps(rsq21,rinv21);
933             r21              = _mm_andnot_ps(dummy_mask,r21);
934
935             /* EWALD ELECTROSTATICS */
936
937             /* Analytical PME correction */
938             zeta2            = _mm_mul_ps(beta2,rsq21);
939             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
940             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
941             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
942             felec            = _mm_mul_ps(qq21,felec);
943             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
944             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
945             velec            = _mm_mul_ps(qq21,velec);
946
947             /* Update potential sum for this i atom from the interaction with this j atom. */
948             velec            = _mm_andnot_ps(dummy_mask,velec);
949             velecsum         = _mm_add_ps(velecsum,velec);
950
951             fscal            = felec;
952
953             fscal            = _mm_andnot_ps(dummy_mask,fscal);
954
955              /* Update vectorial force */
956             fix2             = _mm_macc_ps(dx21,fscal,fix2);
957             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
958             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
959
960             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
961             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
962             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
963
964             /**************************
965              * CALCULATE INTERACTIONS *
966              **************************/
967
968             r22              = _mm_mul_ps(rsq22,rinv22);
969             r22              = _mm_andnot_ps(dummy_mask,r22);
970
971             /* EWALD ELECTROSTATICS */
972
973             /* Analytical PME correction */
974             zeta2            = _mm_mul_ps(beta2,rsq22);
975             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
976             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
977             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
978             felec            = _mm_mul_ps(qq22,felec);
979             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
980             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
981             velec            = _mm_mul_ps(qq22,velec);
982
983             /* Update potential sum for this i atom from the interaction with this j atom. */
984             velec            = _mm_andnot_ps(dummy_mask,velec);
985             velecsum         = _mm_add_ps(velecsum,velec);
986
987             fscal            = felec;
988
989             fscal            = _mm_andnot_ps(dummy_mask,fscal);
990
991              /* Update vectorial force */
992             fix2             = _mm_macc_ps(dx22,fscal,fix2);
993             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
994             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
995
996             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
997             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
998             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
999
1000             /**************************
1001              * CALCULATE INTERACTIONS *
1002              **************************/
1003
1004             r23              = _mm_mul_ps(rsq23,rinv23);
1005             r23              = _mm_andnot_ps(dummy_mask,r23);
1006
1007             /* EWALD ELECTROSTATICS */
1008
1009             /* Analytical PME correction */
1010             zeta2            = _mm_mul_ps(beta2,rsq23);
1011             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1012             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1013             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1014             felec            = _mm_mul_ps(qq23,felec);
1015             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1016             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
1017             velec            = _mm_mul_ps(qq23,velec);
1018
1019             /* Update potential sum for this i atom from the interaction with this j atom. */
1020             velec            = _mm_andnot_ps(dummy_mask,velec);
1021             velecsum         = _mm_add_ps(velecsum,velec);
1022
1023             fscal            = felec;
1024
1025             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1026
1027              /* Update vectorial force */
1028             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1029             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1030             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1031
1032             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1033             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1034             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1035
1036             /**************************
1037              * CALCULATE INTERACTIONS *
1038              **************************/
1039
1040             r31              = _mm_mul_ps(rsq31,rinv31);
1041             r31              = _mm_andnot_ps(dummy_mask,r31);
1042
1043             /* EWALD ELECTROSTATICS */
1044
1045             /* Analytical PME correction */
1046             zeta2            = _mm_mul_ps(beta2,rsq31);
1047             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1048             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1049             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1050             felec            = _mm_mul_ps(qq31,felec);
1051             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1052             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1053             velec            = _mm_mul_ps(qq31,velec);
1054
1055             /* Update potential sum for this i atom from the interaction with this j atom. */
1056             velec            = _mm_andnot_ps(dummy_mask,velec);
1057             velecsum         = _mm_add_ps(velecsum,velec);
1058
1059             fscal            = felec;
1060
1061             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1062
1063              /* Update vectorial force */
1064             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1065             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1066             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1067
1068             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1069             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1070             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1071
1072             /**************************
1073              * CALCULATE INTERACTIONS *
1074              **************************/
1075
1076             r32              = _mm_mul_ps(rsq32,rinv32);
1077             r32              = _mm_andnot_ps(dummy_mask,r32);
1078
1079             /* EWALD ELECTROSTATICS */
1080
1081             /* Analytical PME correction */
1082             zeta2            = _mm_mul_ps(beta2,rsq32);
1083             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1084             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1085             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1086             felec            = _mm_mul_ps(qq32,felec);
1087             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1088             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1089             velec            = _mm_mul_ps(qq32,velec);
1090
1091             /* Update potential sum for this i atom from the interaction with this j atom. */
1092             velec            = _mm_andnot_ps(dummy_mask,velec);
1093             velecsum         = _mm_add_ps(velecsum,velec);
1094
1095             fscal            = felec;
1096
1097             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1098
1099              /* Update vectorial force */
1100             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1101             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1102             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1103
1104             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1105             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1106             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1107
1108             /**************************
1109              * CALCULATE INTERACTIONS *
1110              **************************/
1111
1112             r33              = _mm_mul_ps(rsq33,rinv33);
1113             r33              = _mm_andnot_ps(dummy_mask,r33);
1114
1115             /* EWALD ELECTROSTATICS */
1116
1117             /* Analytical PME correction */
1118             zeta2            = _mm_mul_ps(beta2,rsq33);
1119             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1120             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1121             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1122             felec            = _mm_mul_ps(qq33,felec);
1123             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1124             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1125             velec            = _mm_mul_ps(qq33,velec);
1126
1127             /* Update potential sum for this i atom from the interaction with this j atom. */
1128             velec            = _mm_andnot_ps(dummy_mask,velec);
1129             velecsum         = _mm_add_ps(velecsum,velec);
1130
1131             fscal            = felec;
1132
1133             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1134
1135              /* Update vectorial force */
1136             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1137             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1138             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1139
1140             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1141             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1142             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1143
1144             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1145             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1146             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1147             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1148
1149             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1150                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1151                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1152
1153             /* Inner loop uses 324 flops */
1154         }
1155
1156         /* End of innermost loop */
1157
1158         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1159                                               f+i_coord_offset,fshift+i_shift_offset);
1160
1161         ggid                        = gid[iidx];
1162         /* Update potential energies */
1163         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1164         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1165
1166         /* Increment number of inner iterations */
1167         inneriter                  += j_index_end - j_index_start;
1168
1169         /* Outer loop uses 26 flops */
1170     }
1171
1172     /* Increment number of outer iterations */
1173     outeriter        += nri;
1174
1175     /* Update outer/inner flops */
1176
1177     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*324);
1178 }
1179 /*
1180  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_single
1181  * Electrostatics interaction: Ewald
1182  * VdW interaction:            LJEwald
1183  * Geometry:                   Water4-Water4
1184  * Calculate force/pot:        Force
1185  */
1186 void
1187 nb_kernel_ElecEw_VdwLJEw_GeomW4W4_F_avx_128_fma_single
1188                     (t_nblist                    * gmx_restrict       nlist,
1189                      rvec                        * gmx_restrict          xx,
1190                      rvec                        * gmx_restrict          ff,
1191                      t_forcerec                  * gmx_restrict          fr,
1192                      t_mdatoms                   * gmx_restrict     mdatoms,
1193                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1194                      t_nrnb                      * gmx_restrict        nrnb)
1195 {
1196     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1197      * just 0 for non-waters.
1198      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1199      * jnr indices corresponding to data put in the four positions in the SIMD register.
1200      */
1201     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1202     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1203     int              jnrA,jnrB,jnrC,jnrD;
1204     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1205     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1206     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1207     real             rcutoff_scalar;
1208     real             *shiftvec,*fshift,*x,*f;
1209     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1210     real             scratch[4*DIM];
1211     __m128           fscal,rcutoff,rcutoff2,jidxall;
1212     int              vdwioffset0;
1213     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1214     int              vdwioffset1;
1215     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1216     int              vdwioffset2;
1217     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1218     int              vdwioffset3;
1219     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1220     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1221     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1222     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1223     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1224     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1225     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1226     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1227     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1228     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1229     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1230     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1231     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1232     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1233     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1234     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1235     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1236     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1237     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1238     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1239     real             *charge;
1240     int              nvdwtype;
1241     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1242     int              *vdwtype;
1243     real             *vdwparam;
1244     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1245     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1246     __m128           c6grid_00;
1247     __m128           c6grid_11;
1248     __m128           c6grid_12;
1249     __m128           c6grid_13;
1250     __m128           c6grid_21;
1251     __m128           c6grid_22;
1252     __m128           c6grid_23;
1253     __m128           c6grid_31;
1254     __m128           c6grid_32;
1255     __m128           c6grid_33;
1256     real             *vdwgridparam;
1257     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1258     __m128           one_half = _mm_set1_ps(0.5);
1259     __m128           minus_one = _mm_set1_ps(-1.0);
1260     __m128i          ewitab;
1261     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1262     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1263     real             *ewtab;
1264     __m128           dummy_mask,cutoff_mask;
1265     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1266     __m128           one     = _mm_set1_ps(1.0);
1267     __m128           two     = _mm_set1_ps(2.0);
1268     x                = xx[0];
1269     f                = ff[0];
1270
1271     nri              = nlist->nri;
1272     iinr             = nlist->iinr;
1273     jindex           = nlist->jindex;
1274     jjnr             = nlist->jjnr;
1275     shiftidx         = nlist->shift;
1276     gid              = nlist->gid;
1277     shiftvec         = fr->shift_vec[0];
1278     fshift           = fr->fshift[0];
1279     facel            = _mm_set1_ps(fr->epsfac);
1280     charge           = mdatoms->chargeA;
1281     nvdwtype         = fr->ntype;
1282     vdwparam         = fr->nbfp;
1283     vdwtype          = mdatoms->typeA;
1284     vdwgridparam     = fr->ljpme_c6grid;
1285     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
1286     ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
1287     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1288
1289     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1290     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1291     beta2            = _mm_mul_ps(beta,beta);
1292     beta3            = _mm_mul_ps(beta,beta2);
1293     ewtab            = fr->ic->tabq_coul_F;
1294     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1295     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1296
1297     /* Setup water-specific parameters */
1298     inr              = nlist->iinr[0];
1299     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1300     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1301     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1302     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1303
1304     jq1              = _mm_set1_ps(charge[inr+1]);
1305     jq2              = _mm_set1_ps(charge[inr+2]);
1306     jq3              = _mm_set1_ps(charge[inr+3]);
1307     vdwjidx0A        = 2*vdwtype[inr+0];
1308     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1309     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1310     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1311     qq11             = _mm_mul_ps(iq1,jq1);
1312     qq12             = _mm_mul_ps(iq1,jq2);
1313     qq13             = _mm_mul_ps(iq1,jq3);
1314     qq21             = _mm_mul_ps(iq2,jq1);
1315     qq22             = _mm_mul_ps(iq2,jq2);
1316     qq23             = _mm_mul_ps(iq2,jq3);
1317     qq31             = _mm_mul_ps(iq3,jq1);
1318     qq32             = _mm_mul_ps(iq3,jq2);
1319     qq33             = _mm_mul_ps(iq3,jq3);
1320
1321     /* Avoid stupid compiler warnings */
1322     jnrA = jnrB = jnrC = jnrD = 0;
1323     j_coord_offsetA = 0;
1324     j_coord_offsetB = 0;
1325     j_coord_offsetC = 0;
1326     j_coord_offsetD = 0;
1327
1328     outeriter        = 0;
1329     inneriter        = 0;
1330
1331     for(iidx=0;iidx<4*DIM;iidx++)
1332     {
1333         scratch[iidx] = 0.0;
1334     }
1335
1336     /* Start outer loop over neighborlists */
1337     for(iidx=0; iidx<nri; iidx++)
1338     {
1339         /* Load shift vector for this list */
1340         i_shift_offset   = DIM*shiftidx[iidx];
1341
1342         /* Load limits for loop over neighbors */
1343         j_index_start    = jindex[iidx];
1344         j_index_end      = jindex[iidx+1];
1345
1346         /* Get outer coordinate index */
1347         inr              = iinr[iidx];
1348         i_coord_offset   = DIM*inr;
1349
1350         /* Load i particle coords and add shift vector */
1351         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1352                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1353
1354         fix0             = _mm_setzero_ps();
1355         fiy0             = _mm_setzero_ps();
1356         fiz0             = _mm_setzero_ps();
1357         fix1             = _mm_setzero_ps();
1358         fiy1             = _mm_setzero_ps();
1359         fiz1             = _mm_setzero_ps();
1360         fix2             = _mm_setzero_ps();
1361         fiy2             = _mm_setzero_ps();
1362         fiz2             = _mm_setzero_ps();
1363         fix3             = _mm_setzero_ps();
1364         fiy3             = _mm_setzero_ps();
1365         fiz3             = _mm_setzero_ps();
1366
1367         /* Start inner kernel loop */
1368         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1369         {
1370
1371             /* Get j neighbor index, and coordinate index */
1372             jnrA             = jjnr[jidx];
1373             jnrB             = jjnr[jidx+1];
1374             jnrC             = jjnr[jidx+2];
1375             jnrD             = jjnr[jidx+3];
1376             j_coord_offsetA  = DIM*jnrA;
1377             j_coord_offsetB  = DIM*jnrB;
1378             j_coord_offsetC  = DIM*jnrC;
1379             j_coord_offsetD  = DIM*jnrD;
1380
1381             /* load j atom coordinates */
1382             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1383                                               x+j_coord_offsetC,x+j_coord_offsetD,
1384                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1385                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1386
1387             /* Calculate displacement vector */
1388             dx00             = _mm_sub_ps(ix0,jx0);
1389             dy00             = _mm_sub_ps(iy0,jy0);
1390             dz00             = _mm_sub_ps(iz0,jz0);
1391             dx11             = _mm_sub_ps(ix1,jx1);
1392             dy11             = _mm_sub_ps(iy1,jy1);
1393             dz11             = _mm_sub_ps(iz1,jz1);
1394             dx12             = _mm_sub_ps(ix1,jx2);
1395             dy12             = _mm_sub_ps(iy1,jy2);
1396             dz12             = _mm_sub_ps(iz1,jz2);
1397             dx13             = _mm_sub_ps(ix1,jx3);
1398             dy13             = _mm_sub_ps(iy1,jy3);
1399             dz13             = _mm_sub_ps(iz1,jz3);
1400             dx21             = _mm_sub_ps(ix2,jx1);
1401             dy21             = _mm_sub_ps(iy2,jy1);
1402             dz21             = _mm_sub_ps(iz2,jz1);
1403             dx22             = _mm_sub_ps(ix2,jx2);
1404             dy22             = _mm_sub_ps(iy2,jy2);
1405             dz22             = _mm_sub_ps(iz2,jz2);
1406             dx23             = _mm_sub_ps(ix2,jx3);
1407             dy23             = _mm_sub_ps(iy2,jy3);
1408             dz23             = _mm_sub_ps(iz2,jz3);
1409             dx31             = _mm_sub_ps(ix3,jx1);
1410             dy31             = _mm_sub_ps(iy3,jy1);
1411             dz31             = _mm_sub_ps(iz3,jz1);
1412             dx32             = _mm_sub_ps(ix3,jx2);
1413             dy32             = _mm_sub_ps(iy3,jy2);
1414             dz32             = _mm_sub_ps(iz3,jz2);
1415             dx33             = _mm_sub_ps(ix3,jx3);
1416             dy33             = _mm_sub_ps(iy3,jy3);
1417             dz33             = _mm_sub_ps(iz3,jz3);
1418
1419             /* Calculate squared distance and things based on it */
1420             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1421             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1422             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1423             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1424             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1425             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1426             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1427             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1428             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1429             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1430
1431             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1432             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1433             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1434             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1435             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1436             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1437             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1438             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1439             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1440             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1441
1442             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1443             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1444             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1445             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1446             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1447             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1448             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1449             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1450             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1451             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1452
1453             fjx0             = _mm_setzero_ps();
1454             fjy0             = _mm_setzero_ps();
1455             fjz0             = _mm_setzero_ps();
1456             fjx1             = _mm_setzero_ps();
1457             fjy1             = _mm_setzero_ps();
1458             fjz1             = _mm_setzero_ps();
1459             fjx2             = _mm_setzero_ps();
1460             fjy2             = _mm_setzero_ps();
1461             fjz2             = _mm_setzero_ps();
1462             fjx3             = _mm_setzero_ps();
1463             fjy3             = _mm_setzero_ps();
1464             fjz3             = _mm_setzero_ps();
1465
1466             /**************************
1467              * CALCULATE INTERACTIONS *
1468              **************************/
1469
1470             r00              = _mm_mul_ps(rsq00,rinv00);
1471
1472             /* Analytical LJ-PME */
1473             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1474             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
1475             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1476             exponent         = gmx_simd_exp_r(ewcljrsq);
1477             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1478             poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
1479             /* f6A = 6 * C6grid * (1 - poly) */
1480             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1481             /* f6B = C6grid * exponent * beta^6 */
1482             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1483             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1484             fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1485
1486             fscal            = fvdw;
1487
1488              /* Update vectorial force */
1489             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1490             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1491             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1492
1493             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1494             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1495             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1496
1497             /**************************
1498              * CALCULATE INTERACTIONS *
1499              **************************/
1500
1501             r11              = _mm_mul_ps(rsq11,rinv11);
1502
1503             /* EWALD ELECTROSTATICS */
1504
1505             /* Analytical PME correction */
1506             zeta2            = _mm_mul_ps(beta2,rsq11);
1507             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1508             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1509             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1510             felec            = _mm_mul_ps(qq11,felec);
1511
1512             fscal            = felec;
1513
1514              /* Update vectorial force */
1515             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1516             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1517             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1518
1519             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1520             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1521             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1522
1523             /**************************
1524              * CALCULATE INTERACTIONS *
1525              **************************/
1526
1527             r12              = _mm_mul_ps(rsq12,rinv12);
1528
1529             /* EWALD ELECTROSTATICS */
1530
1531             /* Analytical PME correction */
1532             zeta2            = _mm_mul_ps(beta2,rsq12);
1533             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1534             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1535             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1536             felec            = _mm_mul_ps(qq12,felec);
1537
1538             fscal            = felec;
1539
1540              /* Update vectorial force */
1541             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1542             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1543             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1544
1545             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1546             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1547             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1548
1549             /**************************
1550              * CALCULATE INTERACTIONS *
1551              **************************/
1552
1553             r13              = _mm_mul_ps(rsq13,rinv13);
1554
1555             /* EWALD ELECTROSTATICS */
1556
1557             /* Analytical PME correction */
1558             zeta2            = _mm_mul_ps(beta2,rsq13);
1559             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1560             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1561             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1562             felec            = _mm_mul_ps(qq13,felec);
1563
1564             fscal            = felec;
1565
1566              /* Update vectorial force */
1567             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1568             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1569             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1570
1571             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1572             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1573             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1574
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             r21              = _mm_mul_ps(rsq21,rinv21);
1580
1581             /* EWALD ELECTROSTATICS */
1582
1583             /* Analytical PME correction */
1584             zeta2            = _mm_mul_ps(beta2,rsq21);
1585             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1586             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1587             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1588             felec            = _mm_mul_ps(qq21,felec);
1589
1590             fscal            = felec;
1591
1592              /* Update vectorial force */
1593             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1594             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1595             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1596
1597             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1598             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1599             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1600
1601             /**************************
1602              * CALCULATE INTERACTIONS *
1603              **************************/
1604
1605             r22              = _mm_mul_ps(rsq22,rinv22);
1606
1607             /* EWALD ELECTROSTATICS */
1608
1609             /* Analytical PME correction */
1610             zeta2            = _mm_mul_ps(beta2,rsq22);
1611             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1612             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1613             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1614             felec            = _mm_mul_ps(qq22,felec);
1615
1616             fscal            = felec;
1617
1618              /* Update vectorial force */
1619             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1620             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1621             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1622
1623             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1624             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1625             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1626
1627             /**************************
1628              * CALCULATE INTERACTIONS *
1629              **************************/
1630
1631             r23              = _mm_mul_ps(rsq23,rinv23);
1632
1633             /* EWALD ELECTROSTATICS */
1634
1635             /* Analytical PME correction */
1636             zeta2            = _mm_mul_ps(beta2,rsq23);
1637             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1638             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1639             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1640             felec            = _mm_mul_ps(qq23,felec);
1641
1642             fscal            = felec;
1643
1644              /* Update vectorial force */
1645             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1646             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1647             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1648
1649             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1650             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1651             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1652
1653             /**************************
1654              * CALCULATE INTERACTIONS *
1655              **************************/
1656
1657             r31              = _mm_mul_ps(rsq31,rinv31);
1658
1659             /* EWALD ELECTROSTATICS */
1660
1661             /* Analytical PME correction */
1662             zeta2            = _mm_mul_ps(beta2,rsq31);
1663             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1664             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1665             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1666             felec            = _mm_mul_ps(qq31,felec);
1667
1668             fscal            = felec;
1669
1670              /* Update vectorial force */
1671             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1672             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1673             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1674
1675             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1676             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1677             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1678
1679             /**************************
1680              * CALCULATE INTERACTIONS *
1681              **************************/
1682
1683             r32              = _mm_mul_ps(rsq32,rinv32);
1684
1685             /* EWALD ELECTROSTATICS */
1686
1687             /* Analytical PME correction */
1688             zeta2            = _mm_mul_ps(beta2,rsq32);
1689             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1690             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1691             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1692             felec            = _mm_mul_ps(qq32,felec);
1693
1694             fscal            = felec;
1695
1696              /* Update vectorial force */
1697             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1698             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1699             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1700
1701             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1702             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1703             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1704
1705             /**************************
1706              * CALCULATE INTERACTIONS *
1707              **************************/
1708
1709             r33              = _mm_mul_ps(rsq33,rinv33);
1710
1711             /* EWALD ELECTROSTATICS */
1712
1713             /* Analytical PME correction */
1714             zeta2            = _mm_mul_ps(beta2,rsq33);
1715             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1716             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1717             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1718             felec            = _mm_mul_ps(qq33,felec);
1719
1720             fscal            = felec;
1721
1722              /* Update vectorial force */
1723             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1724             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1725             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1726
1727             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1728             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1729             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1730
1731             fjptrA             = f+j_coord_offsetA;
1732             fjptrB             = f+j_coord_offsetB;
1733             fjptrC             = f+j_coord_offsetC;
1734             fjptrD             = f+j_coord_offsetD;
1735
1736             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1737                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1738                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1739
1740             /* Inner loop uses 302 flops */
1741         }
1742
1743         if(jidx<j_index_end)
1744         {
1745
1746             /* Get j neighbor index, and coordinate index */
1747             jnrlistA         = jjnr[jidx];
1748             jnrlistB         = jjnr[jidx+1];
1749             jnrlistC         = jjnr[jidx+2];
1750             jnrlistD         = jjnr[jidx+3];
1751             /* Sign of each element will be negative for non-real atoms.
1752              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1753              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1754              */
1755             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1756             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1757             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1758             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1759             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1760             j_coord_offsetA  = DIM*jnrA;
1761             j_coord_offsetB  = DIM*jnrB;
1762             j_coord_offsetC  = DIM*jnrC;
1763             j_coord_offsetD  = DIM*jnrD;
1764
1765             /* load j atom coordinates */
1766             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1767                                               x+j_coord_offsetC,x+j_coord_offsetD,
1768                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1769                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1770
1771             /* Calculate displacement vector */
1772             dx00             = _mm_sub_ps(ix0,jx0);
1773             dy00             = _mm_sub_ps(iy0,jy0);
1774             dz00             = _mm_sub_ps(iz0,jz0);
1775             dx11             = _mm_sub_ps(ix1,jx1);
1776             dy11             = _mm_sub_ps(iy1,jy1);
1777             dz11             = _mm_sub_ps(iz1,jz1);
1778             dx12             = _mm_sub_ps(ix1,jx2);
1779             dy12             = _mm_sub_ps(iy1,jy2);
1780             dz12             = _mm_sub_ps(iz1,jz2);
1781             dx13             = _mm_sub_ps(ix1,jx3);
1782             dy13             = _mm_sub_ps(iy1,jy3);
1783             dz13             = _mm_sub_ps(iz1,jz3);
1784             dx21             = _mm_sub_ps(ix2,jx1);
1785             dy21             = _mm_sub_ps(iy2,jy1);
1786             dz21             = _mm_sub_ps(iz2,jz1);
1787             dx22             = _mm_sub_ps(ix2,jx2);
1788             dy22             = _mm_sub_ps(iy2,jy2);
1789             dz22             = _mm_sub_ps(iz2,jz2);
1790             dx23             = _mm_sub_ps(ix2,jx3);
1791             dy23             = _mm_sub_ps(iy2,jy3);
1792             dz23             = _mm_sub_ps(iz2,jz3);
1793             dx31             = _mm_sub_ps(ix3,jx1);
1794             dy31             = _mm_sub_ps(iy3,jy1);
1795             dz31             = _mm_sub_ps(iz3,jz1);
1796             dx32             = _mm_sub_ps(ix3,jx2);
1797             dy32             = _mm_sub_ps(iy3,jy2);
1798             dz32             = _mm_sub_ps(iz3,jz2);
1799             dx33             = _mm_sub_ps(ix3,jx3);
1800             dy33             = _mm_sub_ps(iy3,jy3);
1801             dz33             = _mm_sub_ps(iz3,jz3);
1802
1803             /* Calculate squared distance and things based on it */
1804             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1805             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1806             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1807             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1808             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1809             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1810             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1811             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1812             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1813             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1814
1815             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1816             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1817             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1818             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1819             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1820             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1821             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1822             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1823             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1824             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1825
1826             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1827             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1828             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1829             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1830             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1831             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1832             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1833             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1834             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1835             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1836
1837             fjx0             = _mm_setzero_ps();
1838             fjy0             = _mm_setzero_ps();
1839             fjz0             = _mm_setzero_ps();
1840             fjx1             = _mm_setzero_ps();
1841             fjy1             = _mm_setzero_ps();
1842             fjz1             = _mm_setzero_ps();
1843             fjx2             = _mm_setzero_ps();
1844             fjy2             = _mm_setzero_ps();
1845             fjz2             = _mm_setzero_ps();
1846             fjx3             = _mm_setzero_ps();
1847             fjy3             = _mm_setzero_ps();
1848             fjz3             = _mm_setzero_ps();
1849
1850             /**************************
1851              * CALCULATE INTERACTIONS *
1852              **************************/
1853
1854             r00              = _mm_mul_ps(rsq00,rinv00);
1855             r00              = _mm_andnot_ps(dummy_mask,r00);
1856
1857             /* Analytical LJ-PME */
1858             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1859             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
1860             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1861             exponent         = gmx_simd_exp_r(ewcljrsq);
1862             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1863             poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
1864             /* f6A = 6 * C6grid * (1 - poly) */
1865             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1866             /* f6B = C6grid * exponent * beta^6 */
1867             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1868             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1869             fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1870
1871             fscal            = fvdw;
1872
1873             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1874
1875              /* Update vectorial force */
1876             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1877             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1878             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1879
1880             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1881             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1882             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1883
1884             /**************************
1885              * CALCULATE INTERACTIONS *
1886              **************************/
1887
1888             r11              = _mm_mul_ps(rsq11,rinv11);
1889             r11              = _mm_andnot_ps(dummy_mask,r11);
1890
1891             /* EWALD ELECTROSTATICS */
1892
1893             /* Analytical PME correction */
1894             zeta2            = _mm_mul_ps(beta2,rsq11);
1895             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1896             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1897             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1898             felec            = _mm_mul_ps(qq11,felec);
1899
1900             fscal            = felec;
1901
1902             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1903
1904              /* Update vectorial force */
1905             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1906             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1907             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1908
1909             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1910             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1911             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1912
1913             /**************************
1914              * CALCULATE INTERACTIONS *
1915              **************************/
1916
1917             r12              = _mm_mul_ps(rsq12,rinv12);
1918             r12              = _mm_andnot_ps(dummy_mask,r12);
1919
1920             /* EWALD ELECTROSTATICS */
1921
1922             /* Analytical PME correction */
1923             zeta2            = _mm_mul_ps(beta2,rsq12);
1924             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1925             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1926             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1927             felec            = _mm_mul_ps(qq12,felec);
1928
1929             fscal            = felec;
1930
1931             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1932
1933              /* Update vectorial force */
1934             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1935             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1936             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1937
1938             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1939             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1940             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1941
1942             /**************************
1943              * CALCULATE INTERACTIONS *
1944              **************************/
1945
1946             r13              = _mm_mul_ps(rsq13,rinv13);
1947             r13              = _mm_andnot_ps(dummy_mask,r13);
1948
1949             /* EWALD ELECTROSTATICS */
1950
1951             /* Analytical PME correction */
1952             zeta2            = _mm_mul_ps(beta2,rsq13);
1953             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1954             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1955             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1956             felec            = _mm_mul_ps(qq13,felec);
1957
1958             fscal            = felec;
1959
1960             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1961
1962              /* Update vectorial force */
1963             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1964             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1965             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1966
1967             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1968             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1969             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1970
1971             /**************************
1972              * CALCULATE INTERACTIONS *
1973              **************************/
1974
1975             r21              = _mm_mul_ps(rsq21,rinv21);
1976             r21              = _mm_andnot_ps(dummy_mask,r21);
1977
1978             /* EWALD ELECTROSTATICS */
1979
1980             /* Analytical PME correction */
1981             zeta2            = _mm_mul_ps(beta2,rsq21);
1982             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1983             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1984             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1985             felec            = _mm_mul_ps(qq21,felec);
1986
1987             fscal            = felec;
1988
1989             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1990
1991              /* Update vectorial force */
1992             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1993             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1994             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1995
1996             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1997             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1998             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1999
2000             /**************************
2001              * CALCULATE INTERACTIONS *
2002              **************************/
2003
2004             r22              = _mm_mul_ps(rsq22,rinv22);
2005             r22              = _mm_andnot_ps(dummy_mask,r22);
2006
2007             /* EWALD ELECTROSTATICS */
2008
2009             /* Analytical PME correction */
2010             zeta2            = _mm_mul_ps(beta2,rsq22);
2011             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2012             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2013             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2014             felec            = _mm_mul_ps(qq22,felec);
2015
2016             fscal            = felec;
2017
2018             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2019
2020              /* Update vectorial force */
2021             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2022             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2023             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2024
2025             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2026             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2027             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2028
2029             /**************************
2030              * CALCULATE INTERACTIONS *
2031              **************************/
2032
2033             r23              = _mm_mul_ps(rsq23,rinv23);
2034             r23              = _mm_andnot_ps(dummy_mask,r23);
2035
2036             /* EWALD ELECTROSTATICS */
2037
2038             /* Analytical PME correction */
2039             zeta2            = _mm_mul_ps(beta2,rsq23);
2040             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
2041             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2042             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2043             felec            = _mm_mul_ps(qq23,felec);
2044
2045             fscal            = felec;
2046
2047             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2048
2049              /* Update vectorial force */
2050             fix2             = _mm_macc_ps(dx23,fscal,fix2);
2051             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
2052             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
2053
2054             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
2055             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
2056             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
2057
2058             /**************************
2059              * CALCULATE INTERACTIONS *
2060              **************************/
2061
2062             r31              = _mm_mul_ps(rsq31,rinv31);
2063             r31              = _mm_andnot_ps(dummy_mask,r31);
2064
2065             /* EWALD ELECTROSTATICS */
2066
2067             /* Analytical PME correction */
2068             zeta2            = _mm_mul_ps(beta2,rsq31);
2069             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
2070             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2071             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2072             felec            = _mm_mul_ps(qq31,felec);
2073
2074             fscal            = felec;
2075
2076             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2077
2078              /* Update vectorial force */
2079             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2080             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2081             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2082
2083             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2084             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2085             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2086
2087             /**************************
2088              * CALCULATE INTERACTIONS *
2089              **************************/
2090
2091             r32              = _mm_mul_ps(rsq32,rinv32);
2092             r32              = _mm_andnot_ps(dummy_mask,r32);
2093
2094             /* EWALD ELECTROSTATICS */
2095
2096             /* Analytical PME correction */
2097             zeta2            = _mm_mul_ps(beta2,rsq32);
2098             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
2099             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2100             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2101             felec            = _mm_mul_ps(qq32,felec);
2102
2103             fscal            = felec;
2104
2105             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2106
2107              /* Update vectorial force */
2108             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2109             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2110             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2111
2112             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2113             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2114             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2115
2116             /**************************
2117              * CALCULATE INTERACTIONS *
2118              **************************/
2119
2120             r33              = _mm_mul_ps(rsq33,rinv33);
2121             r33              = _mm_andnot_ps(dummy_mask,r33);
2122
2123             /* EWALD ELECTROSTATICS */
2124
2125             /* Analytical PME correction */
2126             zeta2            = _mm_mul_ps(beta2,rsq33);
2127             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
2128             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2129             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2130             felec            = _mm_mul_ps(qq33,felec);
2131
2132             fscal            = felec;
2133
2134             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2135
2136              /* Update vectorial force */
2137             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2138             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2139             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2140
2141             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2142             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2143             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2144
2145             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2146             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2147             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2148             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2149
2150             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2151                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2152                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2153
2154             /* Inner loop uses 312 flops */
2155         }
2156
2157         /* End of innermost loop */
2158
2159         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2160                                               f+i_coord_offset,fshift+i_shift_offset);
2161
2162         /* Increment number of inner iterations */
2163         inneriter                  += j_index_end - j_index_start;
2164
2165         /* Outer loop uses 24 flops */
2166     }
2167
2168     /* Increment number of outer iterations */
2169     outeriter        += nri;
2170
2171     /* Update outer/inner flops */
2172
2173     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*312);
2174 }