Introduce gmxpre.h for truly global definitions
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwLJ_GeomW4W4_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwioffset3;
92     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
94     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
96     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
97     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
98     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
99     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
100     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
101     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
102     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
105     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
108     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
109     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
110     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
111     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
112     real             *charge;
113     int              nvdwtype;
114     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115     int              *vdwtype;
116     real             *vdwparam;
117     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
118     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
119     __m128i          ewitab;
120     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
121     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
122     real             *ewtab;
123     __m128           dummy_mask,cutoff_mask;
124     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
125     __m128           one     = _mm_set1_ps(1.0);
126     __m128           two     = _mm_set1_ps(2.0);
127     x                = xx[0];
128     f                = ff[0];
129
130     nri              = nlist->nri;
131     iinr             = nlist->iinr;
132     jindex           = nlist->jindex;
133     jjnr             = nlist->jjnr;
134     shiftidx         = nlist->shift;
135     gid              = nlist->gid;
136     shiftvec         = fr->shift_vec[0];
137     fshift           = fr->fshift[0];
138     facel            = _mm_set1_ps(fr->epsfac);
139     charge           = mdatoms->chargeA;
140     nvdwtype         = fr->ntype;
141     vdwparam         = fr->nbfp;
142     vdwtype          = mdatoms->typeA;
143
144     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
145     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
146     beta2            = _mm_mul_ps(beta,beta);
147     beta3            = _mm_mul_ps(beta,beta2);
148     ewtab            = fr->ic->tabq_coul_FDV0;
149     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
150     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
151
152     /* Setup water-specific parameters */
153     inr              = nlist->iinr[0];
154     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
155     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
156     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
157     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
158
159     jq1              = _mm_set1_ps(charge[inr+1]);
160     jq2              = _mm_set1_ps(charge[inr+2]);
161     jq3              = _mm_set1_ps(charge[inr+3]);
162     vdwjidx0A        = 2*vdwtype[inr+0];
163     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
164     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
165     qq11             = _mm_mul_ps(iq1,jq1);
166     qq12             = _mm_mul_ps(iq1,jq2);
167     qq13             = _mm_mul_ps(iq1,jq3);
168     qq21             = _mm_mul_ps(iq2,jq1);
169     qq22             = _mm_mul_ps(iq2,jq2);
170     qq23             = _mm_mul_ps(iq2,jq3);
171     qq31             = _mm_mul_ps(iq3,jq1);
172     qq32             = _mm_mul_ps(iq3,jq2);
173     qq33             = _mm_mul_ps(iq3,jq3);
174
175     /* Avoid stupid compiler warnings */
176     jnrA = jnrB = jnrC = jnrD = 0;
177     j_coord_offsetA = 0;
178     j_coord_offsetB = 0;
179     j_coord_offsetC = 0;
180     j_coord_offsetD = 0;
181
182     outeriter        = 0;
183     inneriter        = 0;
184
185     for(iidx=0;iidx<4*DIM;iidx++)
186     {
187         scratch[iidx] = 0.0;
188     }
189
190     /* Start outer loop over neighborlists */
191     for(iidx=0; iidx<nri; iidx++)
192     {
193         /* Load shift vector for this list */
194         i_shift_offset   = DIM*shiftidx[iidx];
195
196         /* Load limits for loop over neighbors */
197         j_index_start    = jindex[iidx];
198         j_index_end      = jindex[iidx+1];
199
200         /* Get outer coordinate index */
201         inr              = iinr[iidx];
202         i_coord_offset   = DIM*inr;
203
204         /* Load i particle coords and add shift vector */
205         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
206                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
207
208         fix0             = _mm_setzero_ps();
209         fiy0             = _mm_setzero_ps();
210         fiz0             = _mm_setzero_ps();
211         fix1             = _mm_setzero_ps();
212         fiy1             = _mm_setzero_ps();
213         fiz1             = _mm_setzero_ps();
214         fix2             = _mm_setzero_ps();
215         fiy2             = _mm_setzero_ps();
216         fiz2             = _mm_setzero_ps();
217         fix3             = _mm_setzero_ps();
218         fiy3             = _mm_setzero_ps();
219         fiz3             = _mm_setzero_ps();
220
221         /* Reset potential sums */
222         velecsum         = _mm_setzero_ps();
223         vvdwsum          = _mm_setzero_ps();
224
225         /* Start inner kernel loop */
226         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
227         {
228
229             /* Get j neighbor index, and coordinate index */
230             jnrA             = jjnr[jidx];
231             jnrB             = jjnr[jidx+1];
232             jnrC             = jjnr[jidx+2];
233             jnrD             = jjnr[jidx+3];
234             j_coord_offsetA  = DIM*jnrA;
235             j_coord_offsetB  = DIM*jnrB;
236             j_coord_offsetC  = DIM*jnrC;
237             j_coord_offsetD  = DIM*jnrD;
238
239             /* load j atom coordinates */
240             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
241                                               x+j_coord_offsetC,x+j_coord_offsetD,
242                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
243                                               &jy2,&jz2,&jx3,&jy3,&jz3);
244
245             /* Calculate displacement vector */
246             dx00             = _mm_sub_ps(ix0,jx0);
247             dy00             = _mm_sub_ps(iy0,jy0);
248             dz00             = _mm_sub_ps(iz0,jz0);
249             dx11             = _mm_sub_ps(ix1,jx1);
250             dy11             = _mm_sub_ps(iy1,jy1);
251             dz11             = _mm_sub_ps(iz1,jz1);
252             dx12             = _mm_sub_ps(ix1,jx2);
253             dy12             = _mm_sub_ps(iy1,jy2);
254             dz12             = _mm_sub_ps(iz1,jz2);
255             dx13             = _mm_sub_ps(ix1,jx3);
256             dy13             = _mm_sub_ps(iy1,jy3);
257             dz13             = _mm_sub_ps(iz1,jz3);
258             dx21             = _mm_sub_ps(ix2,jx1);
259             dy21             = _mm_sub_ps(iy2,jy1);
260             dz21             = _mm_sub_ps(iz2,jz1);
261             dx22             = _mm_sub_ps(ix2,jx2);
262             dy22             = _mm_sub_ps(iy2,jy2);
263             dz22             = _mm_sub_ps(iz2,jz2);
264             dx23             = _mm_sub_ps(ix2,jx3);
265             dy23             = _mm_sub_ps(iy2,jy3);
266             dz23             = _mm_sub_ps(iz2,jz3);
267             dx31             = _mm_sub_ps(ix3,jx1);
268             dy31             = _mm_sub_ps(iy3,jy1);
269             dz31             = _mm_sub_ps(iz3,jz1);
270             dx32             = _mm_sub_ps(ix3,jx2);
271             dy32             = _mm_sub_ps(iy3,jy2);
272             dz32             = _mm_sub_ps(iz3,jz2);
273             dx33             = _mm_sub_ps(ix3,jx3);
274             dy33             = _mm_sub_ps(iy3,jy3);
275             dz33             = _mm_sub_ps(iz3,jz3);
276
277             /* Calculate squared distance and things based on it */
278             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
279             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
280             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
281             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
282             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
283             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
284             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
285             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
286             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
287             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
288
289             rinv11           = gmx_mm_invsqrt_ps(rsq11);
290             rinv12           = gmx_mm_invsqrt_ps(rsq12);
291             rinv13           = gmx_mm_invsqrt_ps(rsq13);
292             rinv21           = gmx_mm_invsqrt_ps(rsq21);
293             rinv22           = gmx_mm_invsqrt_ps(rsq22);
294             rinv23           = gmx_mm_invsqrt_ps(rsq23);
295             rinv31           = gmx_mm_invsqrt_ps(rsq31);
296             rinv32           = gmx_mm_invsqrt_ps(rsq32);
297             rinv33           = gmx_mm_invsqrt_ps(rsq33);
298
299             rinvsq00         = gmx_mm_inv_ps(rsq00);
300             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
301             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
302             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
303             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
304             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
305             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
306             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
307             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
308             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
309
310             fjx0             = _mm_setzero_ps();
311             fjy0             = _mm_setzero_ps();
312             fjz0             = _mm_setzero_ps();
313             fjx1             = _mm_setzero_ps();
314             fjy1             = _mm_setzero_ps();
315             fjz1             = _mm_setzero_ps();
316             fjx2             = _mm_setzero_ps();
317             fjy2             = _mm_setzero_ps();
318             fjz2             = _mm_setzero_ps();
319             fjx3             = _mm_setzero_ps();
320             fjy3             = _mm_setzero_ps();
321             fjz3             = _mm_setzero_ps();
322
323             /**************************
324              * CALCULATE INTERACTIONS *
325              **************************/
326
327             /* LENNARD-JONES DISPERSION/REPULSION */
328
329             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
330             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
331             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
332             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
333             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
334
335             /* Update potential sum for this i atom from the interaction with this j atom. */
336             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
337
338             fscal            = fvdw;
339
340              /* Update vectorial force */
341             fix0             = _mm_macc_ps(dx00,fscal,fix0);
342             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
343             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
344
345             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
346             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
347             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
348
349             /**************************
350              * CALCULATE INTERACTIONS *
351              **************************/
352
353             r11              = _mm_mul_ps(rsq11,rinv11);
354
355             /* EWALD ELECTROSTATICS */
356
357             /* Analytical PME correction */
358             zeta2            = _mm_mul_ps(beta2,rsq11);
359             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
360             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
361             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
362             felec            = _mm_mul_ps(qq11,felec);
363             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
364             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
365             velec            = _mm_mul_ps(qq11,velec);
366
367             /* Update potential sum for this i atom from the interaction with this j atom. */
368             velecsum         = _mm_add_ps(velecsum,velec);
369
370             fscal            = felec;
371
372              /* Update vectorial force */
373             fix1             = _mm_macc_ps(dx11,fscal,fix1);
374             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
375             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
376
377             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
378             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
379             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
380
381             /**************************
382              * CALCULATE INTERACTIONS *
383              **************************/
384
385             r12              = _mm_mul_ps(rsq12,rinv12);
386
387             /* EWALD ELECTROSTATICS */
388
389             /* Analytical PME correction */
390             zeta2            = _mm_mul_ps(beta2,rsq12);
391             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
392             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
393             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
394             felec            = _mm_mul_ps(qq12,felec);
395             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
396             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
397             velec            = _mm_mul_ps(qq12,velec);
398
399             /* Update potential sum for this i atom from the interaction with this j atom. */
400             velecsum         = _mm_add_ps(velecsum,velec);
401
402             fscal            = felec;
403
404              /* Update vectorial force */
405             fix1             = _mm_macc_ps(dx12,fscal,fix1);
406             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
407             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
408
409             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
410             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
411             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
412
413             /**************************
414              * CALCULATE INTERACTIONS *
415              **************************/
416
417             r13              = _mm_mul_ps(rsq13,rinv13);
418
419             /* EWALD ELECTROSTATICS */
420
421             /* Analytical PME correction */
422             zeta2            = _mm_mul_ps(beta2,rsq13);
423             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
424             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
425             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
426             felec            = _mm_mul_ps(qq13,felec);
427             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
428             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
429             velec            = _mm_mul_ps(qq13,velec);
430
431             /* Update potential sum for this i atom from the interaction with this j atom. */
432             velecsum         = _mm_add_ps(velecsum,velec);
433
434             fscal            = felec;
435
436              /* Update vectorial force */
437             fix1             = _mm_macc_ps(dx13,fscal,fix1);
438             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
439             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
440
441             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
442             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
443             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
444
445             /**************************
446              * CALCULATE INTERACTIONS *
447              **************************/
448
449             r21              = _mm_mul_ps(rsq21,rinv21);
450
451             /* EWALD ELECTROSTATICS */
452
453             /* Analytical PME correction */
454             zeta2            = _mm_mul_ps(beta2,rsq21);
455             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
456             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
457             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
458             felec            = _mm_mul_ps(qq21,felec);
459             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
460             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
461             velec            = _mm_mul_ps(qq21,velec);
462
463             /* Update potential sum for this i atom from the interaction with this j atom. */
464             velecsum         = _mm_add_ps(velecsum,velec);
465
466             fscal            = felec;
467
468              /* Update vectorial force */
469             fix2             = _mm_macc_ps(dx21,fscal,fix2);
470             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
471             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
472
473             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
474             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
475             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
476
477             /**************************
478              * CALCULATE INTERACTIONS *
479              **************************/
480
481             r22              = _mm_mul_ps(rsq22,rinv22);
482
483             /* EWALD ELECTROSTATICS */
484
485             /* Analytical PME correction */
486             zeta2            = _mm_mul_ps(beta2,rsq22);
487             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
488             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
489             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
490             felec            = _mm_mul_ps(qq22,felec);
491             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
492             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
493             velec            = _mm_mul_ps(qq22,velec);
494
495             /* Update potential sum for this i atom from the interaction with this j atom. */
496             velecsum         = _mm_add_ps(velecsum,velec);
497
498             fscal            = felec;
499
500              /* Update vectorial force */
501             fix2             = _mm_macc_ps(dx22,fscal,fix2);
502             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
503             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
504
505             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
506             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
507             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
508
509             /**************************
510              * CALCULATE INTERACTIONS *
511              **************************/
512
513             r23              = _mm_mul_ps(rsq23,rinv23);
514
515             /* EWALD ELECTROSTATICS */
516
517             /* Analytical PME correction */
518             zeta2            = _mm_mul_ps(beta2,rsq23);
519             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
520             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
521             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
522             felec            = _mm_mul_ps(qq23,felec);
523             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
524             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
525             velec            = _mm_mul_ps(qq23,velec);
526
527             /* Update potential sum for this i atom from the interaction with this j atom. */
528             velecsum         = _mm_add_ps(velecsum,velec);
529
530             fscal            = felec;
531
532              /* Update vectorial force */
533             fix2             = _mm_macc_ps(dx23,fscal,fix2);
534             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
535             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
536
537             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
538             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
539             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
540
541             /**************************
542              * CALCULATE INTERACTIONS *
543              **************************/
544
545             r31              = _mm_mul_ps(rsq31,rinv31);
546
547             /* EWALD ELECTROSTATICS */
548
549             /* Analytical PME correction */
550             zeta2            = _mm_mul_ps(beta2,rsq31);
551             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
552             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
553             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
554             felec            = _mm_mul_ps(qq31,felec);
555             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
556             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
557             velec            = _mm_mul_ps(qq31,velec);
558
559             /* Update potential sum for this i atom from the interaction with this j atom. */
560             velecsum         = _mm_add_ps(velecsum,velec);
561
562             fscal            = felec;
563
564              /* Update vectorial force */
565             fix3             = _mm_macc_ps(dx31,fscal,fix3);
566             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
567             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
568
569             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
570             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
571             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
572
573             /**************************
574              * CALCULATE INTERACTIONS *
575              **************************/
576
577             r32              = _mm_mul_ps(rsq32,rinv32);
578
579             /* EWALD ELECTROSTATICS */
580
581             /* Analytical PME correction */
582             zeta2            = _mm_mul_ps(beta2,rsq32);
583             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
584             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
585             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
586             felec            = _mm_mul_ps(qq32,felec);
587             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
588             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
589             velec            = _mm_mul_ps(qq32,velec);
590
591             /* Update potential sum for this i atom from the interaction with this j atom. */
592             velecsum         = _mm_add_ps(velecsum,velec);
593
594             fscal            = felec;
595
596              /* Update vectorial force */
597             fix3             = _mm_macc_ps(dx32,fscal,fix3);
598             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
599             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
600
601             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
602             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
603             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
604
605             /**************************
606              * CALCULATE INTERACTIONS *
607              **************************/
608
609             r33              = _mm_mul_ps(rsq33,rinv33);
610
611             /* EWALD ELECTROSTATICS */
612
613             /* Analytical PME correction */
614             zeta2            = _mm_mul_ps(beta2,rsq33);
615             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
616             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
617             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
618             felec            = _mm_mul_ps(qq33,felec);
619             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
620             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
621             velec            = _mm_mul_ps(qq33,velec);
622
623             /* Update potential sum for this i atom from the interaction with this j atom. */
624             velecsum         = _mm_add_ps(velecsum,velec);
625
626             fscal            = felec;
627
628              /* Update vectorial force */
629             fix3             = _mm_macc_ps(dx33,fscal,fix3);
630             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
631             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
632
633             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
634             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
635             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
636
637             fjptrA             = f+j_coord_offsetA;
638             fjptrB             = f+j_coord_offsetB;
639             fjptrC             = f+j_coord_offsetC;
640             fjptrD             = f+j_coord_offsetD;
641
642             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
643                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
644                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
645
646             /* Inner loop uses 299 flops */
647         }
648
649         if(jidx<j_index_end)
650         {
651
652             /* Get j neighbor index, and coordinate index */
653             jnrlistA         = jjnr[jidx];
654             jnrlistB         = jjnr[jidx+1];
655             jnrlistC         = jjnr[jidx+2];
656             jnrlistD         = jjnr[jidx+3];
657             /* Sign of each element will be negative for non-real atoms.
658              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
659              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
660              */
661             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
662             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
663             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
664             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
665             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
666             j_coord_offsetA  = DIM*jnrA;
667             j_coord_offsetB  = DIM*jnrB;
668             j_coord_offsetC  = DIM*jnrC;
669             j_coord_offsetD  = DIM*jnrD;
670
671             /* load j atom coordinates */
672             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
673                                               x+j_coord_offsetC,x+j_coord_offsetD,
674                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
675                                               &jy2,&jz2,&jx3,&jy3,&jz3);
676
677             /* Calculate displacement vector */
678             dx00             = _mm_sub_ps(ix0,jx0);
679             dy00             = _mm_sub_ps(iy0,jy0);
680             dz00             = _mm_sub_ps(iz0,jz0);
681             dx11             = _mm_sub_ps(ix1,jx1);
682             dy11             = _mm_sub_ps(iy1,jy1);
683             dz11             = _mm_sub_ps(iz1,jz1);
684             dx12             = _mm_sub_ps(ix1,jx2);
685             dy12             = _mm_sub_ps(iy1,jy2);
686             dz12             = _mm_sub_ps(iz1,jz2);
687             dx13             = _mm_sub_ps(ix1,jx3);
688             dy13             = _mm_sub_ps(iy1,jy3);
689             dz13             = _mm_sub_ps(iz1,jz3);
690             dx21             = _mm_sub_ps(ix2,jx1);
691             dy21             = _mm_sub_ps(iy2,jy1);
692             dz21             = _mm_sub_ps(iz2,jz1);
693             dx22             = _mm_sub_ps(ix2,jx2);
694             dy22             = _mm_sub_ps(iy2,jy2);
695             dz22             = _mm_sub_ps(iz2,jz2);
696             dx23             = _mm_sub_ps(ix2,jx3);
697             dy23             = _mm_sub_ps(iy2,jy3);
698             dz23             = _mm_sub_ps(iz2,jz3);
699             dx31             = _mm_sub_ps(ix3,jx1);
700             dy31             = _mm_sub_ps(iy3,jy1);
701             dz31             = _mm_sub_ps(iz3,jz1);
702             dx32             = _mm_sub_ps(ix3,jx2);
703             dy32             = _mm_sub_ps(iy3,jy2);
704             dz32             = _mm_sub_ps(iz3,jz2);
705             dx33             = _mm_sub_ps(ix3,jx3);
706             dy33             = _mm_sub_ps(iy3,jy3);
707             dz33             = _mm_sub_ps(iz3,jz3);
708
709             /* Calculate squared distance and things based on it */
710             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
711             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
712             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
713             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
714             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
715             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
716             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
717             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
718             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
719             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
720
721             rinv11           = gmx_mm_invsqrt_ps(rsq11);
722             rinv12           = gmx_mm_invsqrt_ps(rsq12);
723             rinv13           = gmx_mm_invsqrt_ps(rsq13);
724             rinv21           = gmx_mm_invsqrt_ps(rsq21);
725             rinv22           = gmx_mm_invsqrt_ps(rsq22);
726             rinv23           = gmx_mm_invsqrt_ps(rsq23);
727             rinv31           = gmx_mm_invsqrt_ps(rsq31);
728             rinv32           = gmx_mm_invsqrt_ps(rsq32);
729             rinv33           = gmx_mm_invsqrt_ps(rsq33);
730
731             rinvsq00         = gmx_mm_inv_ps(rsq00);
732             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
733             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
734             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
735             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
736             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
737             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
738             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
739             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
740             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
741
742             fjx0             = _mm_setzero_ps();
743             fjy0             = _mm_setzero_ps();
744             fjz0             = _mm_setzero_ps();
745             fjx1             = _mm_setzero_ps();
746             fjy1             = _mm_setzero_ps();
747             fjz1             = _mm_setzero_ps();
748             fjx2             = _mm_setzero_ps();
749             fjy2             = _mm_setzero_ps();
750             fjz2             = _mm_setzero_ps();
751             fjx3             = _mm_setzero_ps();
752             fjy3             = _mm_setzero_ps();
753             fjz3             = _mm_setzero_ps();
754
755             /**************************
756              * CALCULATE INTERACTIONS *
757              **************************/
758
759             /* LENNARD-JONES DISPERSION/REPULSION */
760
761             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
762             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
763             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
764             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
765             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
766
767             /* Update potential sum for this i atom from the interaction with this j atom. */
768             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
769             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
770
771             fscal            = fvdw;
772
773             fscal            = _mm_andnot_ps(dummy_mask,fscal);
774
775              /* Update vectorial force */
776             fix0             = _mm_macc_ps(dx00,fscal,fix0);
777             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
778             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
779
780             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
781             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
782             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
783
784             /**************************
785              * CALCULATE INTERACTIONS *
786              **************************/
787
788             r11              = _mm_mul_ps(rsq11,rinv11);
789             r11              = _mm_andnot_ps(dummy_mask,r11);
790
791             /* EWALD ELECTROSTATICS */
792
793             /* Analytical PME correction */
794             zeta2            = _mm_mul_ps(beta2,rsq11);
795             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
796             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
797             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
798             felec            = _mm_mul_ps(qq11,felec);
799             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
800             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
801             velec            = _mm_mul_ps(qq11,velec);
802
803             /* Update potential sum for this i atom from the interaction with this j atom. */
804             velec            = _mm_andnot_ps(dummy_mask,velec);
805             velecsum         = _mm_add_ps(velecsum,velec);
806
807             fscal            = felec;
808
809             fscal            = _mm_andnot_ps(dummy_mask,fscal);
810
811              /* Update vectorial force */
812             fix1             = _mm_macc_ps(dx11,fscal,fix1);
813             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
814             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
815
816             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
817             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
818             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
819
820             /**************************
821              * CALCULATE INTERACTIONS *
822              **************************/
823
824             r12              = _mm_mul_ps(rsq12,rinv12);
825             r12              = _mm_andnot_ps(dummy_mask,r12);
826
827             /* EWALD ELECTROSTATICS */
828
829             /* Analytical PME correction */
830             zeta2            = _mm_mul_ps(beta2,rsq12);
831             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
832             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
833             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
834             felec            = _mm_mul_ps(qq12,felec);
835             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
836             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
837             velec            = _mm_mul_ps(qq12,velec);
838
839             /* Update potential sum for this i atom from the interaction with this j atom. */
840             velec            = _mm_andnot_ps(dummy_mask,velec);
841             velecsum         = _mm_add_ps(velecsum,velec);
842
843             fscal            = felec;
844
845             fscal            = _mm_andnot_ps(dummy_mask,fscal);
846
847              /* Update vectorial force */
848             fix1             = _mm_macc_ps(dx12,fscal,fix1);
849             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
850             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
851
852             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
853             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
854             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
855
856             /**************************
857              * CALCULATE INTERACTIONS *
858              **************************/
859
860             r13              = _mm_mul_ps(rsq13,rinv13);
861             r13              = _mm_andnot_ps(dummy_mask,r13);
862
863             /* EWALD ELECTROSTATICS */
864
865             /* Analytical PME correction */
866             zeta2            = _mm_mul_ps(beta2,rsq13);
867             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
868             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
869             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
870             felec            = _mm_mul_ps(qq13,felec);
871             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
872             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv13);
873             velec            = _mm_mul_ps(qq13,velec);
874
875             /* Update potential sum for this i atom from the interaction with this j atom. */
876             velec            = _mm_andnot_ps(dummy_mask,velec);
877             velecsum         = _mm_add_ps(velecsum,velec);
878
879             fscal            = felec;
880
881             fscal            = _mm_andnot_ps(dummy_mask,fscal);
882
883              /* Update vectorial force */
884             fix1             = _mm_macc_ps(dx13,fscal,fix1);
885             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
886             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
887
888             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
889             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
890             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
891
892             /**************************
893              * CALCULATE INTERACTIONS *
894              **************************/
895
896             r21              = _mm_mul_ps(rsq21,rinv21);
897             r21              = _mm_andnot_ps(dummy_mask,r21);
898
899             /* EWALD ELECTROSTATICS */
900
901             /* Analytical PME correction */
902             zeta2            = _mm_mul_ps(beta2,rsq21);
903             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
904             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
905             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
906             felec            = _mm_mul_ps(qq21,felec);
907             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
908             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
909             velec            = _mm_mul_ps(qq21,velec);
910
911             /* Update potential sum for this i atom from the interaction with this j atom. */
912             velec            = _mm_andnot_ps(dummy_mask,velec);
913             velecsum         = _mm_add_ps(velecsum,velec);
914
915             fscal            = felec;
916
917             fscal            = _mm_andnot_ps(dummy_mask,fscal);
918
919              /* Update vectorial force */
920             fix2             = _mm_macc_ps(dx21,fscal,fix2);
921             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
922             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
923
924             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
925             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
926             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
927
928             /**************************
929              * CALCULATE INTERACTIONS *
930              **************************/
931
932             r22              = _mm_mul_ps(rsq22,rinv22);
933             r22              = _mm_andnot_ps(dummy_mask,r22);
934
935             /* EWALD ELECTROSTATICS */
936
937             /* Analytical PME correction */
938             zeta2            = _mm_mul_ps(beta2,rsq22);
939             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
940             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
941             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
942             felec            = _mm_mul_ps(qq22,felec);
943             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
944             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
945             velec            = _mm_mul_ps(qq22,velec);
946
947             /* Update potential sum for this i atom from the interaction with this j atom. */
948             velec            = _mm_andnot_ps(dummy_mask,velec);
949             velecsum         = _mm_add_ps(velecsum,velec);
950
951             fscal            = felec;
952
953             fscal            = _mm_andnot_ps(dummy_mask,fscal);
954
955              /* Update vectorial force */
956             fix2             = _mm_macc_ps(dx22,fscal,fix2);
957             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
958             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
959
960             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
961             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
962             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
963
964             /**************************
965              * CALCULATE INTERACTIONS *
966              **************************/
967
968             r23              = _mm_mul_ps(rsq23,rinv23);
969             r23              = _mm_andnot_ps(dummy_mask,r23);
970
971             /* EWALD ELECTROSTATICS */
972
973             /* Analytical PME correction */
974             zeta2            = _mm_mul_ps(beta2,rsq23);
975             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
976             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
977             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
978             felec            = _mm_mul_ps(qq23,felec);
979             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
980             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv23);
981             velec            = _mm_mul_ps(qq23,velec);
982
983             /* Update potential sum for this i atom from the interaction with this j atom. */
984             velec            = _mm_andnot_ps(dummy_mask,velec);
985             velecsum         = _mm_add_ps(velecsum,velec);
986
987             fscal            = felec;
988
989             fscal            = _mm_andnot_ps(dummy_mask,fscal);
990
991              /* Update vectorial force */
992             fix2             = _mm_macc_ps(dx23,fscal,fix2);
993             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
994             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
995
996             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
997             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
998             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
999
1000             /**************************
1001              * CALCULATE INTERACTIONS *
1002              **************************/
1003
1004             r31              = _mm_mul_ps(rsq31,rinv31);
1005             r31              = _mm_andnot_ps(dummy_mask,r31);
1006
1007             /* EWALD ELECTROSTATICS */
1008
1009             /* Analytical PME correction */
1010             zeta2            = _mm_mul_ps(beta2,rsq31);
1011             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1012             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1013             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1014             felec            = _mm_mul_ps(qq31,felec);
1015             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1016             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv31);
1017             velec            = _mm_mul_ps(qq31,velec);
1018
1019             /* Update potential sum for this i atom from the interaction with this j atom. */
1020             velec            = _mm_andnot_ps(dummy_mask,velec);
1021             velecsum         = _mm_add_ps(velecsum,velec);
1022
1023             fscal            = felec;
1024
1025             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1026
1027              /* Update vectorial force */
1028             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1029             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1030             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1031
1032             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1033             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1034             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1035
1036             /**************************
1037              * CALCULATE INTERACTIONS *
1038              **************************/
1039
1040             r32              = _mm_mul_ps(rsq32,rinv32);
1041             r32              = _mm_andnot_ps(dummy_mask,r32);
1042
1043             /* EWALD ELECTROSTATICS */
1044
1045             /* Analytical PME correction */
1046             zeta2            = _mm_mul_ps(beta2,rsq32);
1047             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1048             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1049             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1050             felec            = _mm_mul_ps(qq32,felec);
1051             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1052             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv32);
1053             velec            = _mm_mul_ps(qq32,velec);
1054
1055             /* Update potential sum for this i atom from the interaction with this j atom. */
1056             velec            = _mm_andnot_ps(dummy_mask,velec);
1057             velecsum         = _mm_add_ps(velecsum,velec);
1058
1059             fscal            = felec;
1060
1061             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1062
1063              /* Update vectorial force */
1064             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1065             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1066             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1067
1068             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1069             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1070             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1071
1072             /**************************
1073              * CALCULATE INTERACTIONS *
1074              **************************/
1075
1076             r33              = _mm_mul_ps(rsq33,rinv33);
1077             r33              = _mm_andnot_ps(dummy_mask,r33);
1078
1079             /* EWALD ELECTROSTATICS */
1080
1081             /* Analytical PME correction */
1082             zeta2            = _mm_mul_ps(beta2,rsq33);
1083             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1084             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1085             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1086             felec            = _mm_mul_ps(qq33,felec);
1087             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1088             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv33);
1089             velec            = _mm_mul_ps(qq33,velec);
1090
1091             /* Update potential sum for this i atom from the interaction with this j atom. */
1092             velec            = _mm_andnot_ps(dummy_mask,velec);
1093             velecsum         = _mm_add_ps(velecsum,velec);
1094
1095             fscal            = felec;
1096
1097             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1098
1099              /* Update vectorial force */
1100             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1101             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1102             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1103
1104             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1105             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1106             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1107
1108             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1109             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1110             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1111             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1112
1113             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1114                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1115                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1116
1117             /* Inner loop uses 308 flops */
1118         }
1119
1120         /* End of innermost loop */
1121
1122         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1123                                               f+i_coord_offset,fshift+i_shift_offset);
1124
1125         ggid                        = gid[iidx];
1126         /* Update potential energies */
1127         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1128         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1129
1130         /* Increment number of inner iterations */
1131         inneriter                  += j_index_end - j_index_start;
1132
1133         /* Outer loop uses 26 flops */
1134     }
1135
1136     /* Increment number of outer iterations */
1137     outeriter        += nri;
1138
1139     /* Update outer/inner flops */
1140
1141     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*308);
1142 }
1143 /*
1144  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_single
1145  * Electrostatics interaction: Ewald
1146  * VdW interaction:            LennardJones
1147  * Geometry:                   Water4-Water4
1148  * Calculate force/pot:        Force
1149  */
1150 void
1151 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_128_fma_single
1152                     (t_nblist                    * gmx_restrict       nlist,
1153                      rvec                        * gmx_restrict          xx,
1154                      rvec                        * gmx_restrict          ff,
1155                      t_forcerec                  * gmx_restrict          fr,
1156                      t_mdatoms                   * gmx_restrict     mdatoms,
1157                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1158                      t_nrnb                      * gmx_restrict        nrnb)
1159 {
1160     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1161      * just 0 for non-waters.
1162      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1163      * jnr indices corresponding to data put in the four positions in the SIMD register.
1164      */
1165     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1166     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1167     int              jnrA,jnrB,jnrC,jnrD;
1168     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1169     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1170     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1171     real             rcutoff_scalar;
1172     real             *shiftvec,*fshift,*x,*f;
1173     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1174     real             scratch[4*DIM];
1175     __m128           fscal,rcutoff,rcutoff2,jidxall;
1176     int              vdwioffset0;
1177     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1178     int              vdwioffset1;
1179     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1180     int              vdwioffset2;
1181     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1182     int              vdwioffset3;
1183     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1184     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1185     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1186     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1187     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1188     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1189     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1190     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1191     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1192     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1193     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1194     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1195     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1196     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1197     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1198     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1199     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1200     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1201     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1202     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1203     real             *charge;
1204     int              nvdwtype;
1205     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1206     int              *vdwtype;
1207     real             *vdwparam;
1208     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1209     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1210     __m128i          ewitab;
1211     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1212     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1213     real             *ewtab;
1214     __m128           dummy_mask,cutoff_mask;
1215     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1216     __m128           one     = _mm_set1_ps(1.0);
1217     __m128           two     = _mm_set1_ps(2.0);
1218     x                = xx[0];
1219     f                = ff[0];
1220
1221     nri              = nlist->nri;
1222     iinr             = nlist->iinr;
1223     jindex           = nlist->jindex;
1224     jjnr             = nlist->jjnr;
1225     shiftidx         = nlist->shift;
1226     gid              = nlist->gid;
1227     shiftvec         = fr->shift_vec[0];
1228     fshift           = fr->fshift[0];
1229     facel            = _mm_set1_ps(fr->epsfac);
1230     charge           = mdatoms->chargeA;
1231     nvdwtype         = fr->ntype;
1232     vdwparam         = fr->nbfp;
1233     vdwtype          = mdatoms->typeA;
1234
1235     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1236     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1237     beta2            = _mm_mul_ps(beta,beta);
1238     beta3            = _mm_mul_ps(beta,beta2);
1239     ewtab            = fr->ic->tabq_coul_F;
1240     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1241     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1242
1243     /* Setup water-specific parameters */
1244     inr              = nlist->iinr[0];
1245     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1246     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1247     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1248     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1249
1250     jq1              = _mm_set1_ps(charge[inr+1]);
1251     jq2              = _mm_set1_ps(charge[inr+2]);
1252     jq3              = _mm_set1_ps(charge[inr+3]);
1253     vdwjidx0A        = 2*vdwtype[inr+0];
1254     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1255     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1256     qq11             = _mm_mul_ps(iq1,jq1);
1257     qq12             = _mm_mul_ps(iq1,jq2);
1258     qq13             = _mm_mul_ps(iq1,jq3);
1259     qq21             = _mm_mul_ps(iq2,jq1);
1260     qq22             = _mm_mul_ps(iq2,jq2);
1261     qq23             = _mm_mul_ps(iq2,jq3);
1262     qq31             = _mm_mul_ps(iq3,jq1);
1263     qq32             = _mm_mul_ps(iq3,jq2);
1264     qq33             = _mm_mul_ps(iq3,jq3);
1265
1266     /* Avoid stupid compiler warnings */
1267     jnrA = jnrB = jnrC = jnrD = 0;
1268     j_coord_offsetA = 0;
1269     j_coord_offsetB = 0;
1270     j_coord_offsetC = 0;
1271     j_coord_offsetD = 0;
1272
1273     outeriter        = 0;
1274     inneriter        = 0;
1275
1276     for(iidx=0;iidx<4*DIM;iidx++)
1277     {
1278         scratch[iidx] = 0.0;
1279     }
1280
1281     /* Start outer loop over neighborlists */
1282     for(iidx=0; iidx<nri; iidx++)
1283     {
1284         /* Load shift vector for this list */
1285         i_shift_offset   = DIM*shiftidx[iidx];
1286
1287         /* Load limits for loop over neighbors */
1288         j_index_start    = jindex[iidx];
1289         j_index_end      = jindex[iidx+1];
1290
1291         /* Get outer coordinate index */
1292         inr              = iinr[iidx];
1293         i_coord_offset   = DIM*inr;
1294
1295         /* Load i particle coords and add shift vector */
1296         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1297                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1298
1299         fix0             = _mm_setzero_ps();
1300         fiy0             = _mm_setzero_ps();
1301         fiz0             = _mm_setzero_ps();
1302         fix1             = _mm_setzero_ps();
1303         fiy1             = _mm_setzero_ps();
1304         fiz1             = _mm_setzero_ps();
1305         fix2             = _mm_setzero_ps();
1306         fiy2             = _mm_setzero_ps();
1307         fiz2             = _mm_setzero_ps();
1308         fix3             = _mm_setzero_ps();
1309         fiy3             = _mm_setzero_ps();
1310         fiz3             = _mm_setzero_ps();
1311
1312         /* Start inner kernel loop */
1313         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1314         {
1315
1316             /* Get j neighbor index, and coordinate index */
1317             jnrA             = jjnr[jidx];
1318             jnrB             = jjnr[jidx+1];
1319             jnrC             = jjnr[jidx+2];
1320             jnrD             = jjnr[jidx+3];
1321             j_coord_offsetA  = DIM*jnrA;
1322             j_coord_offsetB  = DIM*jnrB;
1323             j_coord_offsetC  = DIM*jnrC;
1324             j_coord_offsetD  = DIM*jnrD;
1325
1326             /* load j atom coordinates */
1327             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1328                                               x+j_coord_offsetC,x+j_coord_offsetD,
1329                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1330                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1331
1332             /* Calculate displacement vector */
1333             dx00             = _mm_sub_ps(ix0,jx0);
1334             dy00             = _mm_sub_ps(iy0,jy0);
1335             dz00             = _mm_sub_ps(iz0,jz0);
1336             dx11             = _mm_sub_ps(ix1,jx1);
1337             dy11             = _mm_sub_ps(iy1,jy1);
1338             dz11             = _mm_sub_ps(iz1,jz1);
1339             dx12             = _mm_sub_ps(ix1,jx2);
1340             dy12             = _mm_sub_ps(iy1,jy2);
1341             dz12             = _mm_sub_ps(iz1,jz2);
1342             dx13             = _mm_sub_ps(ix1,jx3);
1343             dy13             = _mm_sub_ps(iy1,jy3);
1344             dz13             = _mm_sub_ps(iz1,jz3);
1345             dx21             = _mm_sub_ps(ix2,jx1);
1346             dy21             = _mm_sub_ps(iy2,jy1);
1347             dz21             = _mm_sub_ps(iz2,jz1);
1348             dx22             = _mm_sub_ps(ix2,jx2);
1349             dy22             = _mm_sub_ps(iy2,jy2);
1350             dz22             = _mm_sub_ps(iz2,jz2);
1351             dx23             = _mm_sub_ps(ix2,jx3);
1352             dy23             = _mm_sub_ps(iy2,jy3);
1353             dz23             = _mm_sub_ps(iz2,jz3);
1354             dx31             = _mm_sub_ps(ix3,jx1);
1355             dy31             = _mm_sub_ps(iy3,jy1);
1356             dz31             = _mm_sub_ps(iz3,jz1);
1357             dx32             = _mm_sub_ps(ix3,jx2);
1358             dy32             = _mm_sub_ps(iy3,jy2);
1359             dz32             = _mm_sub_ps(iz3,jz2);
1360             dx33             = _mm_sub_ps(ix3,jx3);
1361             dy33             = _mm_sub_ps(iy3,jy3);
1362             dz33             = _mm_sub_ps(iz3,jz3);
1363
1364             /* Calculate squared distance and things based on it */
1365             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1366             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1367             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1368             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1369             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1370             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1371             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1372             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1373             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1374             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1375
1376             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1377             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1378             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1379             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1380             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1381             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1382             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1383             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1384             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1385
1386             rinvsq00         = gmx_mm_inv_ps(rsq00);
1387             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1388             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1389             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1390             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1391             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1392             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1393             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1394             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1395             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1396
1397             fjx0             = _mm_setzero_ps();
1398             fjy0             = _mm_setzero_ps();
1399             fjz0             = _mm_setzero_ps();
1400             fjx1             = _mm_setzero_ps();
1401             fjy1             = _mm_setzero_ps();
1402             fjz1             = _mm_setzero_ps();
1403             fjx2             = _mm_setzero_ps();
1404             fjy2             = _mm_setzero_ps();
1405             fjz2             = _mm_setzero_ps();
1406             fjx3             = _mm_setzero_ps();
1407             fjy3             = _mm_setzero_ps();
1408             fjz3             = _mm_setzero_ps();
1409
1410             /**************************
1411              * CALCULATE INTERACTIONS *
1412              **************************/
1413
1414             /* LENNARD-JONES DISPERSION/REPULSION */
1415
1416             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1417             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1418
1419             fscal            = fvdw;
1420
1421              /* Update vectorial force */
1422             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1423             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1424             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1425
1426             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1427             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1428             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1429
1430             /**************************
1431              * CALCULATE INTERACTIONS *
1432              **************************/
1433
1434             r11              = _mm_mul_ps(rsq11,rinv11);
1435
1436             /* EWALD ELECTROSTATICS */
1437
1438             /* Analytical PME correction */
1439             zeta2            = _mm_mul_ps(beta2,rsq11);
1440             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1441             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1442             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1443             felec            = _mm_mul_ps(qq11,felec);
1444
1445             fscal            = felec;
1446
1447              /* Update vectorial force */
1448             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1449             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1450             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1451
1452             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1453             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1454             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1455
1456             /**************************
1457              * CALCULATE INTERACTIONS *
1458              **************************/
1459
1460             r12              = _mm_mul_ps(rsq12,rinv12);
1461
1462             /* EWALD ELECTROSTATICS */
1463
1464             /* Analytical PME correction */
1465             zeta2            = _mm_mul_ps(beta2,rsq12);
1466             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1467             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1468             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1469             felec            = _mm_mul_ps(qq12,felec);
1470
1471             fscal            = felec;
1472
1473              /* Update vectorial force */
1474             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1475             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1476             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1477
1478             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1479             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1480             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1481
1482             /**************************
1483              * CALCULATE INTERACTIONS *
1484              **************************/
1485
1486             r13              = _mm_mul_ps(rsq13,rinv13);
1487
1488             /* EWALD ELECTROSTATICS */
1489
1490             /* Analytical PME correction */
1491             zeta2            = _mm_mul_ps(beta2,rsq13);
1492             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1493             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1494             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1495             felec            = _mm_mul_ps(qq13,felec);
1496
1497             fscal            = felec;
1498
1499              /* Update vectorial force */
1500             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1501             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1502             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1503
1504             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1505             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1506             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1507
1508             /**************************
1509              * CALCULATE INTERACTIONS *
1510              **************************/
1511
1512             r21              = _mm_mul_ps(rsq21,rinv21);
1513
1514             /* EWALD ELECTROSTATICS */
1515
1516             /* Analytical PME correction */
1517             zeta2            = _mm_mul_ps(beta2,rsq21);
1518             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1519             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1520             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1521             felec            = _mm_mul_ps(qq21,felec);
1522
1523             fscal            = felec;
1524
1525              /* Update vectorial force */
1526             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1527             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1528             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1529
1530             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1531             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1532             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1533
1534             /**************************
1535              * CALCULATE INTERACTIONS *
1536              **************************/
1537
1538             r22              = _mm_mul_ps(rsq22,rinv22);
1539
1540             /* EWALD ELECTROSTATICS */
1541
1542             /* Analytical PME correction */
1543             zeta2            = _mm_mul_ps(beta2,rsq22);
1544             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1545             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1546             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1547             felec            = _mm_mul_ps(qq22,felec);
1548
1549             fscal            = felec;
1550
1551              /* Update vectorial force */
1552             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1553             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1554             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1555
1556             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1557             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1558             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1559
1560             /**************************
1561              * CALCULATE INTERACTIONS *
1562              **************************/
1563
1564             r23              = _mm_mul_ps(rsq23,rinv23);
1565
1566             /* EWALD ELECTROSTATICS */
1567
1568             /* Analytical PME correction */
1569             zeta2            = _mm_mul_ps(beta2,rsq23);
1570             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1571             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1572             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1573             felec            = _mm_mul_ps(qq23,felec);
1574
1575             fscal            = felec;
1576
1577              /* Update vectorial force */
1578             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1579             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1580             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1581
1582             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1583             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1584             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1585
1586             /**************************
1587              * CALCULATE INTERACTIONS *
1588              **************************/
1589
1590             r31              = _mm_mul_ps(rsq31,rinv31);
1591
1592             /* EWALD ELECTROSTATICS */
1593
1594             /* Analytical PME correction */
1595             zeta2            = _mm_mul_ps(beta2,rsq31);
1596             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1597             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1598             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1599             felec            = _mm_mul_ps(qq31,felec);
1600
1601             fscal            = felec;
1602
1603              /* Update vectorial force */
1604             fix3             = _mm_macc_ps(dx31,fscal,fix3);
1605             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
1606             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
1607
1608             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
1609             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
1610             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
1611
1612             /**************************
1613              * CALCULATE INTERACTIONS *
1614              **************************/
1615
1616             r32              = _mm_mul_ps(rsq32,rinv32);
1617
1618             /* EWALD ELECTROSTATICS */
1619
1620             /* Analytical PME correction */
1621             zeta2            = _mm_mul_ps(beta2,rsq32);
1622             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
1623             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1624             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1625             felec            = _mm_mul_ps(qq32,felec);
1626
1627             fscal            = felec;
1628
1629              /* Update vectorial force */
1630             fix3             = _mm_macc_ps(dx32,fscal,fix3);
1631             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
1632             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
1633
1634             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
1635             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
1636             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
1637
1638             /**************************
1639              * CALCULATE INTERACTIONS *
1640              **************************/
1641
1642             r33              = _mm_mul_ps(rsq33,rinv33);
1643
1644             /* EWALD ELECTROSTATICS */
1645
1646             /* Analytical PME correction */
1647             zeta2            = _mm_mul_ps(beta2,rsq33);
1648             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
1649             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1650             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1651             felec            = _mm_mul_ps(qq33,felec);
1652
1653             fscal            = felec;
1654
1655              /* Update vectorial force */
1656             fix3             = _mm_macc_ps(dx33,fscal,fix3);
1657             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
1658             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
1659
1660             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
1661             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
1662             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
1663
1664             fjptrA             = f+j_coord_offsetA;
1665             fjptrB             = f+j_coord_offsetB;
1666             fjptrC             = f+j_coord_offsetC;
1667             fjptrD             = f+j_coord_offsetD;
1668
1669             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1670                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1671                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1672
1673             /* Inner loop uses 285 flops */
1674         }
1675
1676         if(jidx<j_index_end)
1677         {
1678
1679             /* Get j neighbor index, and coordinate index */
1680             jnrlistA         = jjnr[jidx];
1681             jnrlistB         = jjnr[jidx+1];
1682             jnrlistC         = jjnr[jidx+2];
1683             jnrlistD         = jjnr[jidx+3];
1684             /* Sign of each element will be negative for non-real atoms.
1685              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1686              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1687              */
1688             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1689             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1690             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1691             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1692             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1693             j_coord_offsetA  = DIM*jnrA;
1694             j_coord_offsetB  = DIM*jnrB;
1695             j_coord_offsetC  = DIM*jnrC;
1696             j_coord_offsetD  = DIM*jnrD;
1697
1698             /* load j atom coordinates */
1699             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1700                                               x+j_coord_offsetC,x+j_coord_offsetD,
1701                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1702                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1703
1704             /* Calculate displacement vector */
1705             dx00             = _mm_sub_ps(ix0,jx0);
1706             dy00             = _mm_sub_ps(iy0,jy0);
1707             dz00             = _mm_sub_ps(iz0,jz0);
1708             dx11             = _mm_sub_ps(ix1,jx1);
1709             dy11             = _mm_sub_ps(iy1,jy1);
1710             dz11             = _mm_sub_ps(iz1,jz1);
1711             dx12             = _mm_sub_ps(ix1,jx2);
1712             dy12             = _mm_sub_ps(iy1,jy2);
1713             dz12             = _mm_sub_ps(iz1,jz2);
1714             dx13             = _mm_sub_ps(ix1,jx3);
1715             dy13             = _mm_sub_ps(iy1,jy3);
1716             dz13             = _mm_sub_ps(iz1,jz3);
1717             dx21             = _mm_sub_ps(ix2,jx1);
1718             dy21             = _mm_sub_ps(iy2,jy1);
1719             dz21             = _mm_sub_ps(iz2,jz1);
1720             dx22             = _mm_sub_ps(ix2,jx2);
1721             dy22             = _mm_sub_ps(iy2,jy2);
1722             dz22             = _mm_sub_ps(iz2,jz2);
1723             dx23             = _mm_sub_ps(ix2,jx3);
1724             dy23             = _mm_sub_ps(iy2,jy3);
1725             dz23             = _mm_sub_ps(iz2,jz3);
1726             dx31             = _mm_sub_ps(ix3,jx1);
1727             dy31             = _mm_sub_ps(iy3,jy1);
1728             dz31             = _mm_sub_ps(iz3,jz1);
1729             dx32             = _mm_sub_ps(ix3,jx2);
1730             dy32             = _mm_sub_ps(iy3,jy2);
1731             dz32             = _mm_sub_ps(iz3,jz2);
1732             dx33             = _mm_sub_ps(ix3,jx3);
1733             dy33             = _mm_sub_ps(iy3,jy3);
1734             dz33             = _mm_sub_ps(iz3,jz3);
1735
1736             /* Calculate squared distance and things based on it */
1737             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1738             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1739             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1740             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1741             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1742             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1743             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1744             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1745             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1746             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1747
1748             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1749             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1750             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1751             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1752             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1753             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1754             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1755             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1756             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1757
1758             rinvsq00         = gmx_mm_inv_ps(rsq00);
1759             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1760             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1761             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1762             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1763             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1764             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1765             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1766             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1767             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1768
1769             fjx0             = _mm_setzero_ps();
1770             fjy0             = _mm_setzero_ps();
1771             fjz0             = _mm_setzero_ps();
1772             fjx1             = _mm_setzero_ps();
1773             fjy1             = _mm_setzero_ps();
1774             fjz1             = _mm_setzero_ps();
1775             fjx2             = _mm_setzero_ps();
1776             fjy2             = _mm_setzero_ps();
1777             fjz2             = _mm_setzero_ps();
1778             fjx3             = _mm_setzero_ps();
1779             fjy3             = _mm_setzero_ps();
1780             fjz3             = _mm_setzero_ps();
1781
1782             /**************************
1783              * CALCULATE INTERACTIONS *
1784              **************************/
1785
1786             /* LENNARD-JONES DISPERSION/REPULSION */
1787
1788             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1789             fvdw             = _mm_mul_ps(_mm_msub_ps(c12_00,rinvsix,c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1790
1791             fscal            = fvdw;
1792
1793             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1794
1795              /* Update vectorial force */
1796             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1797             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1798             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1799
1800             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1801             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1802             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1803
1804             /**************************
1805              * CALCULATE INTERACTIONS *
1806              **************************/
1807
1808             r11              = _mm_mul_ps(rsq11,rinv11);
1809             r11              = _mm_andnot_ps(dummy_mask,r11);
1810
1811             /* EWALD ELECTROSTATICS */
1812
1813             /* Analytical PME correction */
1814             zeta2            = _mm_mul_ps(beta2,rsq11);
1815             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1816             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1817             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1818             felec            = _mm_mul_ps(qq11,felec);
1819
1820             fscal            = felec;
1821
1822             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1823
1824              /* Update vectorial force */
1825             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1826             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1827             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1828
1829             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1830             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1831             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1832
1833             /**************************
1834              * CALCULATE INTERACTIONS *
1835              **************************/
1836
1837             r12              = _mm_mul_ps(rsq12,rinv12);
1838             r12              = _mm_andnot_ps(dummy_mask,r12);
1839
1840             /* EWALD ELECTROSTATICS */
1841
1842             /* Analytical PME correction */
1843             zeta2            = _mm_mul_ps(beta2,rsq12);
1844             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1845             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1846             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1847             felec            = _mm_mul_ps(qq12,felec);
1848
1849             fscal            = felec;
1850
1851             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1852
1853              /* Update vectorial force */
1854             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1855             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1856             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1857
1858             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1859             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1860             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1861
1862             /**************************
1863              * CALCULATE INTERACTIONS *
1864              **************************/
1865
1866             r13              = _mm_mul_ps(rsq13,rinv13);
1867             r13              = _mm_andnot_ps(dummy_mask,r13);
1868
1869             /* EWALD ELECTROSTATICS */
1870
1871             /* Analytical PME correction */
1872             zeta2            = _mm_mul_ps(beta2,rsq13);
1873             rinv3            = _mm_mul_ps(rinvsq13,rinv13);
1874             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1875             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1876             felec            = _mm_mul_ps(qq13,felec);
1877
1878             fscal            = felec;
1879
1880             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1881
1882              /* Update vectorial force */
1883             fix1             = _mm_macc_ps(dx13,fscal,fix1);
1884             fiy1             = _mm_macc_ps(dy13,fscal,fiy1);
1885             fiz1             = _mm_macc_ps(dz13,fscal,fiz1);
1886
1887             fjx3             = _mm_macc_ps(dx13,fscal,fjx3);
1888             fjy3             = _mm_macc_ps(dy13,fscal,fjy3);
1889             fjz3             = _mm_macc_ps(dz13,fscal,fjz3);
1890
1891             /**************************
1892              * CALCULATE INTERACTIONS *
1893              **************************/
1894
1895             r21              = _mm_mul_ps(rsq21,rinv21);
1896             r21              = _mm_andnot_ps(dummy_mask,r21);
1897
1898             /* EWALD ELECTROSTATICS */
1899
1900             /* Analytical PME correction */
1901             zeta2            = _mm_mul_ps(beta2,rsq21);
1902             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1903             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1904             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1905             felec            = _mm_mul_ps(qq21,felec);
1906
1907             fscal            = felec;
1908
1909             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1910
1911              /* Update vectorial force */
1912             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1913             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1914             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1915
1916             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1917             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1918             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1919
1920             /**************************
1921              * CALCULATE INTERACTIONS *
1922              **************************/
1923
1924             r22              = _mm_mul_ps(rsq22,rinv22);
1925             r22              = _mm_andnot_ps(dummy_mask,r22);
1926
1927             /* EWALD ELECTROSTATICS */
1928
1929             /* Analytical PME correction */
1930             zeta2            = _mm_mul_ps(beta2,rsq22);
1931             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1932             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1933             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1934             felec            = _mm_mul_ps(qq22,felec);
1935
1936             fscal            = felec;
1937
1938             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1939
1940              /* Update vectorial force */
1941             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1942             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1943             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1944
1945             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1946             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1947             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1948
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             r23              = _mm_mul_ps(rsq23,rinv23);
1954             r23              = _mm_andnot_ps(dummy_mask,r23);
1955
1956             /* EWALD ELECTROSTATICS */
1957
1958             /* Analytical PME correction */
1959             zeta2            = _mm_mul_ps(beta2,rsq23);
1960             rinv3            = _mm_mul_ps(rinvsq23,rinv23);
1961             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1962             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1963             felec            = _mm_mul_ps(qq23,felec);
1964
1965             fscal            = felec;
1966
1967             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1968
1969              /* Update vectorial force */
1970             fix2             = _mm_macc_ps(dx23,fscal,fix2);
1971             fiy2             = _mm_macc_ps(dy23,fscal,fiy2);
1972             fiz2             = _mm_macc_ps(dz23,fscal,fiz2);
1973
1974             fjx3             = _mm_macc_ps(dx23,fscal,fjx3);
1975             fjy3             = _mm_macc_ps(dy23,fscal,fjy3);
1976             fjz3             = _mm_macc_ps(dz23,fscal,fjz3);
1977
1978             /**************************
1979              * CALCULATE INTERACTIONS *
1980              **************************/
1981
1982             r31              = _mm_mul_ps(rsq31,rinv31);
1983             r31              = _mm_andnot_ps(dummy_mask,r31);
1984
1985             /* EWALD ELECTROSTATICS */
1986
1987             /* Analytical PME correction */
1988             zeta2            = _mm_mul_ps(beta2,rsq31);
1989             rinv3            = _mm_mul_ps(rinvsq31,rinv31);
1990             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1991             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1992             felec            = _mm_mul_ps(qq31,felec);
1993
1994             fscal            = felec;
1995
1996             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1997
1998              /* Update vectorial force */
1999             fix3             = _mm_macc_ps(dx31,fscal,fix3);
2000             fiy3             = _mm_macc_ps(dy31,fscal,fiy3);
2001             fiz3             = _mm_macc_ps(dz31,fscal,fiz3);
2002
2003             fjx1             = _mm_macc_ps(dx31,fscal,fjx1);
2004             fjy1             = _mm_macc_ps(dy31,fscal,fjy1);
2005             fjz1             = _mm_macc_ps(dz31,fscal,fjz1);
2006
2007             /**************************
2008              * CALCULATE INTERACTIONS *
2009              **************************/
2010
2011             r32              = _mm_mul_ps(rsq32,rinv32);
2012             r32              = _mm_andnot_ps(dummy_mask,r32);
2013
2014             /* EWALD ELECTROSTATICS */
2015
2016             /* Analytical PME correction */
2017             zeta2            = _mm_mul_ps(beta2,rsq32);
2018             rinv3            = _mm_mul_ps(rinvsq32,rinv32);
2019             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2020             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2021             felec            = _mm_mul_ps(qq32,felec);
2022
2023             fscal            = felec;
2024
2025             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2026
2027              /* Update vectorial force */
2028             fix3             = _mm_macc_ps(dx32,fscal,fix3);
2029             fiy3             = _mm_macc_ps(dy32,fscal,fiy3);
2030             fiz3             = _mm_macc_ps(dz32,fscal,fiz3);
2031
2032             fjx2             = _mm_macc_ps(dx32,fscal,fjx2);
2033             fjy2             = _mm_macc_ps(dy32,fscal,fjy2);
2034             fjz2             = _mm_macc_ps(dz32,fscal,fjz2);
2035
2036             /**************************
2037              * CALCULATE INTERACTIONS *
2038              **************************/
2039
2040             r33              = _mm_mul_ps(rsq33,rinv33);
2041             r33              = _mm_andnot_ps(dummy_mask,r33);
2042
2043             /* EWALD ELECTROSTATICS */
2044
2045             /* Analytical PME correction */
2046             zeta2            = _mm_mul_ps(beta2,rsq33);
2047             rinv3            = _mm_mul_ps(rinvsq33,rinv33);
2048             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2049             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2050             felec            = _mm_mul_ps(qq33,felec);
2051
2052             fscal            = felec;
2053
2054             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2055
2056              /* Update vectorial force */
2057             fix3             = _mm_macc_ps(dx33,fscal,fix3);
2058             fiy3             = _mm_macc_ps(dy33,fscal,fiy3);
2059             fiz3             = _mm_macc_ps(dz33,fscal,fiz3);
2060
2061             fjx3             = _mm_macc_ps(dx33,fscal,fjx3);
2062             fjy3             = _mm_macc_ps(dy33,fscal,fjy3);
2063             fjz3             = _mm_macc_ps(dz33,fscal,fjz3);
2064
2065             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2066             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2067             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2068             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2069
2070             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2071                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2072                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2073
2074             /* Inner loop uses 294 flops */
2075         }
2076
2077         /* End of innermost loop */
2078
2079         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2080                                               f+i_coord_offset,fshift+i_shift_offset);
2081
2082         /* Increment number of inner iterations */
2083         inneriter                  += j_index_end - j_index_start;
2084
2085         /* Outer loop uses 24 flops */
2086     }
2087
2088     /* Increment number of outer iterations */
2089     outeriter        += nri;
2090
2091     /* Update outer/inner flops */
2092
2093     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*294);
2094 }