411c0691074f0f97496aa44b3318582812825617
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEw_VdwLJEw_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LJEwald
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_VF_avx_128_fma_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128           c6grid_00;
113     __m128           c6grid_01;
114     __m128           c6grid_02;
115     __m128           c6grid_10;
116     __m128           c6grid_11;
117     __m128           c6grid_12;
118     __m128           c6grid_20;
119     __m128           c6grid_21;
120     __m128           c6grid_22;
121     real             *vdwgridparam;
122     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
123     __m128           one_half = _mm_set1_ps(0.5);
124     __m128           minus_one = _mm_set1_ps(-1.0);
125     __m128i          ewitab;
126     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
127     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
128     real             *ewtab;
129     __m128           dummy_mask,cutoff_mask;
130     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
131     __m128           one     = _mm_set1_ps(1.0);
132     __m128           two     = _mm_set1_ps(2.0);
133     x                = xx[0];
134     f                = ff[0];
135
136     nri              = nlist->nri;
137     iinr             = nlist->iinr;
138     jindex           = nlist->jindex;
139     jjnr             = nlist->jjnr;
140     shiftidx         = nlist->shift;
141     gid              = nlist->gid;
142     shiftvec         = fr->shift_vec[0];
143     fshift           = fr->fshift[0];
144     facel            = _mm_set1_ps(fr->epsfac);
145     charge           = mdatoms->chargeA;
146     nvdwtype         = fr->ntype;
147     vdwparam         = fr->nbfp;
148     vdwtype          = mdatoms->typeA;
149     vdwgridparam     = fr->ljpme_c6grid;
150     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
151     ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
152     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
153
154     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
155     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
156     beta2            = _mm_mul_ps(beta,beta);
157     beta3            = _mm_mul_ps(beta,beta2);
158     ewtab            = fr->ic->tabq_coul_FDV0;
159     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
160     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
161
162     /* Setup water-specific parameters */
163     inr              = nlist->iinr[0];
164     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
165     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
166     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
167     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
168
169     jq0              = _mm_set1_ps(charge[inr+0]);
170     jq1              = _mm_set1_ps(charge[inr+1]);
171     jq2              = _mm_set1_ps(charge[inr+2]);
172     vdwjidx0A        = 2*vdwtype[inr+0];
173     qq00             = _mm_mul_ps(iq0,jq0);
174     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
175     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
176     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
177     qq01             = _mm_mul_ps(iq0,jq1);
178     qq02             = _mm_mul_ps(iq0,jq2);
179     qq10             = _mm_mul_ps(iq1,jq0);
180     qq11             = _mm_mul_ps(iq1,jq1);
181     qq12             = _mm_mul_ps(iq1,jq2);
182     qq20             = _mm_mul_ps(iq2,jq0);
183     qq21             = _mm_mul_ps(iq2,jq1);
184     qq22             = _mm_mul_ps(iq2,jq2);
185
186     /* Avoid stupid compiler warnings */
187     jnrA = jnrB = jnrC = jnrD = 0;
188     j_coord_offsetA = 0;
189     j_coord_offsetB = 0;
190     j_coord_offsetC = 0;
191     j_coord_offsetD = 0;
192
193     outeriter        = 0;
194     inneriter        = 0;
195
196     for(iidx=0;iidx<4*DIM;iidx++)
197     {
198         scratch[iidx] = 0.0;
199     }
200
201     /* Start outer loop over neighborlists */
202     for(iidx=0; iidx<nri; iidx++)
203     {
204         /* Load shift vector for this list */
205         i_shift_offset   = DIM*shiftidx[iidx];
206
207         /* Load limits for loop over neighbors */
208         j_index_start    = jindex[iidx];
209         j_index_end      = jindex[iidx+1];
210
211         /* Get outer coordinate index */
212         inr              = iinr[iidx];
213         i_coord_offset   = DIM*inr;
214
215         /* Load i particle coords and add shift vector */
216         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
217                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
218
219         fix0             = _mm_setzero_ps();
220         fiy0             = _mm_setzero_ps();
221         fiz0             = _mm_setzero_ps();
222         fix1             = _mm_setzero_ps();
223         fiy1             = _mm_setzero_ps();
224         fiz1             = _mm_setzero_ps();
225         fix2             = _mm_setzero_ps();
226         fiy2             = _mm_setzero_ps();
227         fiz2             = _mm_setzero_ps();
228
229         /* Reset potential sums */
230         velecsum         = _mm_setzero_ps();
231         vvdwsum          = _mm_setzero_ps();
232
233         /* Start inner kernel loop */
234         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
235         {
236
237             /* Get j neighbor index, and coordinate index */
238             jnrA             = jjnr[jidx];
239             jnrB             = jjnr[jidx+1];
240             jnrC             = jjnr[jidx+2];
241             jnrD             = jjnr[jidx+3];
242             j_coord_offsetA  = DIM*jnrA;
243             j_coord_offsetB  = DIM*jnrB;
244             j_coord_offsetC  = DIM*jnrC;
245             j_coord_offsetD  = DIM*jnrD;
246
247             /* load j atom coordinates */
248             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
249                                               x+j_coord_offsetC,x+j_coord_offsetD,
250                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
251
252             /* Calculate displacement vector */
253             dx00             = _mm_sub_ps(ix0,jx0);
254             dy00             = _mm_sub_ps(iy0,jy0);
255             dz00             = _mm_sub_ps(iz0,jz0);
256             dx01             = _mm_sub_ps(ix0,jx1);
257             dy01             = _mm_sub_ps(iy0,jy1);
258             dz01             = _mm_sub_ps(iz0,jz1);
259             dx02             = _mm_sub_ps(ix0,jx2);
260             dy02             = _mm_sub_ps(iy0,jy2);
261             dz02             = _mm_sub_ps(iz0,jz2);
262             dx10             = _mm_sub_ps(ix1,jx0);
263             dy10             = _mm_sub_ps(iy1,jy0);
264             dz10             = _mm_sub_ps(iz1,jz0);
265             dx11             = _mm_sub_ps(ix1,jx1);
266             dy11             = _mm_sub_ps(iy1,jy1);
267             dz11             = _mm_sub_ps(iz1,jz1);
268             dx12             = _mm_sub_ps(ix1,jx2);
269             dy12             = _mm_sub_ps(iy1,jy2);
270             dz12             = _mm_sub_ps(iz1,jz2);
271             dx20             = _mm_sub_ps(ix2,jx0);
272             dy20             = _mm_sub_ps(iy2,jy0);
273             dz20             = _mm_sub_ps(iz2,jz0);
274             dx21             = _mm_sub_ps(ix2,jx1);
275             dy21             = _mm_sub_ps(iy2,jy1);
276             dz21             = _mm_sub_ps(iz2,jz1);
277             dx22             = _mm_sub_ps(ix2,jx2);
278             dy22             = _mm_sub_ps(iy2,jy2);
279             dz22             = _mm_sub_ps(iz2,jz2);
280
281             /* Calculate squared distance and things based on it */
282             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
283             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
284             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
285             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
286             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
287             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
288             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
289             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
290             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
291
292             rinv00           = gmx_mm_invsqrt_ps(rsq00);
293             rinv01           = gmx_mm_invsqrt_ps(rsq01);
294             rinv02           = gmx_mm_invsqrt_ps(rsq02);
295             rinv10           = gmx_mm_invsqrt_ps(rsq10);
296             rinv11           = gmx_mm_invsqrt_ps(rsq11);
297             rinv12           = gmx_mm_invsqrt_ps(rsq12);
298             rinv20           = gmx_mm_invsqrt_ps(rsq20);
299             rinv21           = gmx_mm_invsqrt_ps(rsq21);
300             rinv22           = gmx_mm_invsqrt_ps(rsq22);
301
302             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
303             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
304             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
305             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
306             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
307             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
308             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
309             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
310             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
311
312             fjx0             = _mm_setzero_ps();
313             fjy0             = _mm_setzero_ps();
314             fjz0             = _mm_setzero_ps();
315             fjx1             = _mm_setzero_ps();
316             fjy1             = _mm_setzero_ps();
317             fjz1             = _mm_setzero_ps();
318             fjx2             = _mm_setzero_ps();
319             fjy2             = _mm_setzero_ps();
320             fjz2             = _mm_setzero_ps();
321
322             /**************************
323              * CALCULATE INTERACTIONS *
324              **************************/
325
326             r00              = _mm_mul_ps(rsq00,rinv00);
327
328             /* EWALD ELECTROSTATICS */
329
330             /* Analytical PME correction */
331             zeta2            = _mm_mul_ps(beta2,rsq00);
332             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
333             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
334             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
335             felec            = _mm_mul_ps(qq00,felec);
336             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
337             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
338             velec            = _mm_mul_ps(qq00,velec);
339
340             /* Analytical LJ-PME */
341             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
342             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
343             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
344             exponent         = gmx_simd_exp_r(ewcljrsq);
345             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
346             poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
347             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
348             vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
349             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
350             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
351             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
352             fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
353
354             /* Update potential sum for this i atom from the interaction with this j atom. */
355             velecsum         = _mm_add_ps(velecsum,velec);
356             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
357
358             fscal            = _mm_add_ps(felec,fvdw);
359
360              /* Update vectorial force */
361             fix0             = _mm_macc_ps(dx00,fscal,fix0);
362             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
363             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
364
365             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
366             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
367             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
368
369             /**************************
370              * CALCULATE INTERACTIONS *
371              **************************/
372
373             r01              = _mm_mul_ps(rsq01,rinv01);
374
375             /* EWALD ELECTROSTATICS */
376
377             /* Analytical PME correction */
378             zeta2            = _mm_mul_ps(beta2,rsq01);
379             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
380             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
381             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
382             felec            = _mm_mul_ps(qq01,felec);
383             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
384             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
385             velec            = _mm_mul_ps(qq01,velec);
386
387             /* Update potential sum for this i atom from the interaction with this j atom. */
388             velecsum         = _mm_add_ps(velecsum,velec);
389
390             fscal            = felec;
391
392              /* Update vectorial force */
393             fix0             = _mm_macc_ps(dx01,fscal,fix0);
394             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
395             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
396
397             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
398             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
399             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
400
401             /**************************
402              * CALCULATE INTERACTIONS *
403              **************************/
404
405             r02              = _mm_mul_ps(rsq02,rinv02);
406
407             /* EWALD ELECTROSTATICS */
408
409             /* Analytical PME correction */
410             zeta2            = _mm_mul_ps(beta2,rsq02);
411             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
412             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
413             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
414             felec            = _mm_mul_ps(qq02,felec);
415             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
416             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
417             velec            = _mm_mul_ps(qq02,velec);
418
419             /* Update potential sum for this i atom from the interaction with this j atom. */
420             velecsum         = _mm_add_ps(velecsum,velec);
421
422             fscal            = felec;
423
424              /* Update vectorial force */
425             fix0             = _mm_macc_ps(dx02,fscal,fix0);
426             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
427             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
428
429             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
430             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
431             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
432
433             /**************************
434              * CALCULATE INTERACTIONS *
435              **************************/
436
437             r10              = _mm_mul_ps(rsq10,rinv10);
438
439             /* EWALD ELECTROSTATICS */
440
441             /* Analytical PME correction */
442             zeta2            = _mm_mul_ps(beta2,rsq10);
443             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
444             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
445             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
446             felec            = _mm_mul_ps(qq10,felec);
447             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
448             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
449             velec            = _mm_mul_ps(qq10,velec);
450
451             /* Update potential sum for this i atom from the interaction with this j atom. */
452             velecsum         = _mm_add_ps(velecsum,velec);
453
454             fscal            = felec;
455
456              /* Update vectorial force */
457             fix1             = _mm_macc_ps(dx10,fscal,fix1);
458             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
459             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
460
461             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
462             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
463             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
464
465             /**************************
466              * CALCULATE INTERACTIONS *
467              **************************/
468
469             r11              = _mm_mul_ps(rsq11,rinv11);
470
471             /* EWALD ELECTROSTATICS */
472
473             /* Analytical PME correction */
474             zeta2            = _mm_mul_ps(beta2,rsq11);
475             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
476             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
477             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
478             felec            = _mm_mul_ps(qq11,felec);
479             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
480             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
481             velec            = _mm_mul_ps(qq11,velec);
482
483             /* Update potential sum for this i atom from the interaction with this j atom. */
484             velecsum         = _mm_add_ps(velecsum,velec);
485
486             fscal            = felec;
487
488              /* Update vectorial force */
489             fix1             = _mm_macc_ps(dx11,fscal,fix1);
490             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
491             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
492
493             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
494             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
495             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
496
497             /**************************
498              * CALCULATE INTERACTIONS *
499              **************************/
500
501             r12              = _mm_mul_ps(rsq12,rinv12);
502
503             /* EWALD ELECTROSTATICS */
504
505             /* Analytical PME correction */
506             zeta2            = _mm_mul_ps(beta2,rsq12);
507             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
508             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
509             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
510             felec            = _mm_mul_ps(qq12,felec);
511             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
512             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
513             velec            = _mm_mul_ps(qq12,velec);
514
515             /* Update potential sum for this i atom from the interaction with this j atom. */
516             velecsum         = _mm_add_ps(velecsum,velec);
517
518             fscal            = felec;
519
520              /* Update vectorial force */
521             fix1             = _mm_macc_ps(dx12,fscal,fix1);
522             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
523             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
524
525             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
526             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
527             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
528
529             /**************************
530              * CALCULATE INTERACTIONS *
531              **************************/
532
533             r20              = _mm_mul_ps(rsq20,rinv20);
534
535             /* EWALD ELECTROSTATICS */
536
537             /* Analytical PME correction */
538             zeta2            = _mm_mul_ps(beta2,rsq20);
539             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
540             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
541             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
542             felec            = _mm_mul_ps(qq20,felec);
543             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
544             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
545             velec            = _mm_mul_ps(qq20,velec);
546
547             /* Update potential sum for this i atom from the interaction with this j atom. */
548             velecsum         = _mm_add_ps(velecsum,velec);
549
550             fscal            = felec;
551
552              /* Update vectorial force */
553             fix2             = _mm_macc_ps(dx20,fscal,fix2);
554             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
555             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
556
557             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
558             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
559             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
560
561             /**************************
562              * CALCULATE INTERACTIONS *
563              **************************/
564
565             r21              = _mm_mul_ps(rsq21,rinv21);
566
567             /* EWALD ELECTROSTATICS */
568
569             /* Analytical PME correction */
570             zeta2            = _mm_mul_ps(beta2,rsq21);
571             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
572             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
573             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
574             felec            = _mm_mul_ps(qq21,felec);
575             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
576             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
577             velec            = _mm_mul_ps(qq21,velec);
578
579             /* Update potential sum for this i atom from the interaction with this j atom. */
580             velecsum         = _mm_add_ps(velecsum,velec);
581
582             fscal            = felec;
583
584              /* Update vectorial force */
585             fix2             = _mm_macc_ps(dx21,fscal,fix2);
586             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
587             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
588
589             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
590             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
591             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
592
593             /**************************
594              * CALCULATE INTERACTIONS *
595              **************************/
596
597             r22              = _mm_mul_ps(rsq22,rinv22);
598
599             /* EWALD ELECTROSTATICS */
600
601             /* Analytical PME correction */
602             zeta2            = _mm_mul_ps(beta2,rsq22);
603             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
604             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
605             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
606             felec            = _mm_mul_ps(qq22,felec);
607             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
608             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
609             velec            = _mm_mul_ps(qq22,velec);
610
611             /* Update potential sum for this i atom from the interaction with this j atom. */
612             velecsum         = _mm_add_ps(velecsum,velec);
613
614             fscal            = felec;
615
616              /* Update vectorial force */
617             fix2             = _mm_macc_ps(dx22,fscal,fix2);
618             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
619             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
620
621             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
622             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
623             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
624
625             fjptrA             = f+j_coord_offsetA;
626             fjptrB             = f+j_coord_offsetB;
627             fjptrC             = f+j_coord_offsetC;
628             fjptrD             = f+j_coord_offsetD;
629
630             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
631                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
632
633             /* Inner loop uses 285 flops */
634         }
635
636         if(jidx<j_index_end)
637         {
638
639             /* Get j neighbor index, and coordinate index */
640             jnrlistA         = jjnr[jidx];
641             jnrlistB         = jjnr[jidx+1];
642             jnrlistC         = jjnr[jidx+2];
643             jnrlistD         = jjnr[jidx+3];
644             /* Sign of each element will be negative for non-real atoms.
645              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
646              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
647              */
648             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
649             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
650             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
651             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
652             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
653             j_coord_offsetA  = DIM*jnrA;
654             j_coord_offsetB  = DIM*jnrB;
655             j_coord_offsetC  = DIM*jnrC;
656             j_coord_offsetD  = DIM*jnrD;
657
658             /* load j atom coordinates */
659             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
660                                               x+j_coord_offsetC,x+j_coord_offsetD,
661                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
662
663             /* Calculate displacement vector */
664             dx00             = _mm_sub_ps(ix0,jx0);
665             dy00             = _mm_sub_ps(iy0,jy0);
666             dz00             = _mm_sub_ps(iz0,jz0);
667             dx01             = _mm_sub_ps(ix0,jx1);
668             dy01             = _mm_sub_ps(iy0,jy1);
669             dz01             = _mm_sub_ps(iz0,jz1);
670             dx02             = _mm_sub_ps(ix0,jx2);
671             dy02             = _mm_sub_ps(iy0,jy2);
672             dz02             = _mm_sub_ps(iz0,jz2);
673             dx10             = _mm_sub_ps(ix1,jx0);
674             dy10             = _mm_sub_ps(iy1,jy0);
675             dz10             = _mm_sub_ps(iz1,jz0);
676             dx11             = _mm_sub_ps(ix1,jx1);
677             dy11             = _mm_sub_ps(iy1,jy1);
678             dz11             = _mm_sub_ps(iz1,jz1);
679             dx12             = _mm_sub_ps(ix1,jx2);
680             dy12             = _mm_sub_ps(iy1,jy2);
681             dz12             = _mm_sub_ps(iz1,jz2);
682             dx20             = _mm_sub_ps(ix2,jx0);
683             dy20             = _mm_sub_ps(iy2,jy0);
684             dz20             = _mm_sub_ps(iz2,jz0);
685             dx21             = _mm_sub_ps(ix2,jx1);
686             dy21             = _mm_sub_ps(iy2,jy1);
687             dz21             = _mm_sub_ps(iz2,jz1);
688             dx22             = _mm_sub_ps(ix2,jx2);
689             dy22             = _mm_sub_ps(iy2,jy2);
690             dz22             = _mm_sub_ps(iz2,jz2);
691
692             /* Calculate squared distance and things based on it */
693             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
694             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
695             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
696             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
697             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
698             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
699             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
700             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
701             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
702
703             rinv00           = gmx_mm_invsqrt_ps(rsq00);
704             rinv01           = gmx_mm_invsqrt_ps(rsq01);
705             rinv02           = gmx_mm_invsqrt_ps(rsq02);
706             rinv10           = gmx_mm_invsqrt_ps(rsq10);
707             rinv11           = gmx_mm_invsqrt_ps(rsq11);
708             rinv12           = gmx_mm_invsqrt_ps(rsq12);
709             rinv20           = gmx_mm_invsqrt_ps(rsq20);
710             rinv21           = gmx_mm_invsqrt_ps(rsq21);
711             rinv22           = gmx_mm_invsqrt_ps(rsq22);
712
713             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
714             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
715             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
716             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
717             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
718             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
719             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
720             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
721             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
722
723             fjx0             = _mm_setzero_ps();
724             fjy0             = _mm_setzero_ps();
725             fjz0             = _mm_setzero_ps();
726             fjx1             = _mm_setzero_ps();
727             fjy1             = _mm_setzero_ps();
728             fjz1             = _mm_setzero_ps();
729             fjx2             = _mm_setzero_ps();
730             fjy2             = _mm_setzero_ps();
731             fjz2             = _mm_setzero_ps();
732
733             /**************************
734              * CALCULATE INTERACTIONS *
735              **************************/
736
737             r00              = _mm_mul_ps(rsq00,rinv00);
738             r00              = _mm_andnot_ps(dummy_mask,r00);
739
740             /* EWALD ELECTROSTATICS */
741
742             /* Analytical PME correction */
743             zeta2            = _mm_mul_ps(beta2,rsq00);
744             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
745             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
746             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
747             felec            = _mm_mul_ps(qq00,felec);
748             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
749             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
750             velec            = _mm_mul_ps(qq00,velec);
751
752             /* Analytical LJ-PME */
753             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
754             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
755             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
756             exponent         = gmx_simd_exp_r(ewcljrsq);
757             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
758             poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
759             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
760             vvdw6            = _mm_mul_ps(_mm_macc_ps(-c6grid_00,_mm_sub_ps(one,poly),c6_00),rinvsix);
761             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
762             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
763             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
764             fvdw             = _mm_mul_ps(_mm_add_ps(vvdw12,_mm_msub_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6),vvdw6)),rinvsq00);
765
766             /* Update potential sum for this i atom from the interaction with this j atom. */
767             velec            = _mm_andnot_ps(dummy_mask,velec);
768             velecsum         = _mm_add_ps(velecsum,velec);
769             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
770             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
771
772             fscal            = _mm_add_ps(felec,fvdw);
773
774             fscal            = _mm_andnot_ps(dummy_mask,fscal);
775
776              /* Update vectorial force */
777             fix0             = _mm_macc_ps(dx00,fscal,fix0);
778             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
779             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
780
781             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
782             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
783             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
784
785             /**************************
786              * CALCULATE INTERACTIONS *
787              **************************/
788
789             r01              = _mm_mul_ps(rsq01,rinv01);
790             r01              = _mm_andnot_ps(dummy_mask,r01);
791
792             /* EWALD ELECTROSTATICS */
793
794             /* Analytical PME correction */
795             zeta2            = _mm_mul_ps(beta2,rsq01);
796             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
797             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
798             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
799             felec            = _mm_mul_ps(qq01,felec);
800             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
801             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
802             velec            = _mm_mul_ps(qq01,velec);
803
804             /* Update potential sum for this i atom from the interaction with this j atom. */
805             velec            = _mm_andnot_ps(dummy_mask,velec);
806             velecsum         = _mm_add_ps(velecsum,velec);
807
808             fscal            = felec;
809
810             fscal            = _mm_andnot_ps(dummy_mask,fscal);
811
812              /* Update vectorial force */
813             fix0             = _mm_macc_ps(dx01,fscal,fix0);
814             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
815             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
816
817             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
818             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
819             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
820
821             /**************************
822              * CALCULATE INTERACTIONS *
823              **************************/
824
825             r02              = _mm_mul_ps(rsq02,rinv02);
826             r02              = _mm_andnot_ps(dummy_mask,r02);
827
828             /* EWALD ELECTROSTATICS */
829
830             /* Analytical PME correction */
831             zeta2            = _mm_mul_ps(beta2,rsq02);
832             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
833             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
834             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
835             felec            = _mm_mul_ps(qq02,felec);
836             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
837             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
838             velec            = _mm_mul_ps(qq02,velec);
839
840             /* Update potential sum for this i atom from the interaction with this j atom. */
841             velec            = _mm_andnot_ps(dummy_mask,velec);
842             velecsum         = _mm_add_ps(velecsum,velec);
843
844             fscal            = felec;
845
846             fscal            = _mm_andnot_ps(dummy_mask,fscal);
847
848              /* Update vectorial force */
849             fix0             = _mm_macc_ps(dx02,fscal,fix0);
850             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
851             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
852
853             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
854             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
855             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
856
857             /**************************
858              * CALCULATE INTERACTIONS *
859              **************************/
860
861             r10              = _mm_mul_ps(rsq10,rinv10);
862             r10              = _mm_andnot_ps(dummy_mask,r10);
863
864             /* EWALD ELECTROSTATICS */
865
866             /* Analytical PME correction */
867             zeta2            = _mm_mul_ps(beta2,rsq10);
868             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
869             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
870             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
871             felec            = _mm_mul_ps(qq10,felec);
872             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
873             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
874             velec            = _mm_mul_ps(qq10,velec);
875
876             /* Update potential sum for this i atom from the interaction with this j atom. */
877             velec            = _mm_andnot_ps(dummy_mask,velec);
878             velecsum         = _mm_add_ps(velecsum,velec);
879
880             fscal            = felec;
881
882             fscal            = _mm_andnot_ps(dummy_mask,fscal);
883
884              /* Update vectorial force */
885             fix1             = _mm_macc_ps(dx10,fscal,fix1);
886             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
887             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
888
889             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
890             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
891             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
892
893             /**************************
894              * CALCULATE INTERACTIONS *
895              **************************/
896
897             r11              = _mm_mul_ps(rsq11,rinv11);
898             r11              = _mm_andnot_ps(dummy_mask,r11);
899
900             /* EWALD ELECTROSTATICS */
901
902             /* Analytical PME correction */
903             zeta2            = _mm_mul_ps(beta2,rsq11);
904             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
905             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
906             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
907             felec            = _mm_mul_ps(qq11,felec);
908             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
909             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
910             velec            = _mm_mul_ps(qq11,velec);
911
912             /* Update potential sum for this i atom from the interaction with this j atom. */
913             velec            = _mm_andnot_ps(dummy_mask,velec);
914             velecsum         = _mm_add_ps(velecsum,velec);
915
916             fscal            = felec;
917
918             fscal            = _mm_andnot_ps(dummy_mask,fscal);
919
920              /* Update vectorial force */
921             fix1             = _mm_macc_ps(dx11,fscal,fix1);
922             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
923             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
924
925             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
926             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
927             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             r12              = _mm_mul_ps(rsq12,rinv12);
934             r12              = _mm_andnot_ps(dummy_mask,r12);
935
936             /* EWALD ELECTROSTATICS */
937
938             /* Analytical PME correction */
939             zeta2            = _mm_mul_ps(beta2,rsq12);
940             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
941             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
942             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
943             felec            = _mm_mul_ps(qq12,felec);
944             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
945             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
946             velec            = _mm_mul_ps(qq12,velec);
947
948             /* Update potential sum for this i atom from the interaction with this j atom. */
949             velec            = _mm_andnot_ps(dummy_mask,velec);
950             velecsum         = _mm_add_ps(velecsum,velec);
951
952             fscal            = felec;
953
954             fscal            = _mm_andnot_ps(dummy_mask,fscal);
955
956              /* Update vectorial force */
957             fix1             = _mm_macc_ps(dx12,fscal,fix1);
958             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
959             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
960
961             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
962             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
963             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
964
965             /**************************
966              * CALCULATE INTERACTIONS *
967              **************************/
968
969             r20              = _mm_mul_ps(rsq20,rinv20);
970             r20              = _mm_andnot_ps(dummy_mask,r20);
971
972             /* EWALD ELECTROSTATICS */
973
974             /* Analytical PME correction */
975             zeta2            = _mm_mul_ps(beta2,rsq20);
976             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
977             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
978             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
979             felec            = _mm_mul_ps(qq20,felec);
980             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
981             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
982             velec            = _mm_mul_ps(qq20,velec);
983
984             /* Update potential sum for this i atom from the interaction with this j atom. */
985             velec            = _mm_andnot_ps(dummy_mask,velec);
986             velecsum         = _mm_add_ps(velecsum,velec);
987
988             fscal            = felec;
989
990             fscal            = _mm_andnot_ps(dummy_mask,fscal);
991
992              /* Update vectorial force */
993             fix2             = _mm_macc_ps(dx20,fscal,fix2);
994             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
995             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
996
997             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
998             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
999             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1000
1001             /**************************
1002              * CALCULATE INTERACTIONS *
1003              **************************/
1004
1005             r21              = _mm_mul_ps(rsq21,rinv21);
1006             r21              = _mm_andnot_ps(dummy_mask,r21);
1007
1008             /* EWALD ELECTROSTATICS */
1009
1010             /* Analytical PME correction */
1011             zeta2            = _mm_mul_ps(beta2,rsq21);
1012             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1013             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1014             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1015             felec            = _mm_mul_ps(qq21,felec);
1016             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1017             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1018             velec            = _mm_mul_ps(qq21,velec);
1019
1020             /* Update potential sum for this i atom from the interaction with this j atom. */
1021             velec            = _mm_andnot_ps(dummy_mask,velec);
1022             velecsum         = _mm_add_ps(velecsum,velec);
1023
1024             fscal            = felec;
1025
1026             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1027
1028              /* Update vectorial force */
1029             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1030             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1031             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1032
1033             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1034             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1035             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1036
1037             /**************************
1038              * CALCULATE INTERACTIONS *
1039              **************************/
1040
1041             r22              = _mm_mul_ps(rsq22,rinv22);
1042             r22              = _mm_andnot_ps(dummy_mask,r22);
1043
1044             /* EWALD ELECTROSTATICS */
1045
1046             /* Analytical PME correction */
1047             zeta2            = _mm_mul_ps(beta2,rsq22);
1048             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1049             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1050             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1051             felec            = _mm_mul_ps(qq22,felec);
1052             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1053             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1054             velec            = _mm_mul_ps(qq22,velec);
1055
1056             /* Update potential sum for this i atom from the interaction with this j atom. */
1057             velec            = _mm_andnot_ps(dummy_mask,velec);
1058             velecsum         = _mm_add_ps(velecsum,velec);
1059
1060             fscal            = felec;
1061
1062             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1063
1064              /* Update vectorial force */
1065             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1066             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1067             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1068
1069             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1070             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1071             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1072
1073             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1074             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1075             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1076             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1077
1078             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1079                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1080
1081             /* Inner loop uses 294 flops */
1082         }
1083
1084         /* End of innermost loop */
1085
1086         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1087                                               f+i_coord_offset,fshift+i_shift_offset);
1088
1089         ggid                        = gid[iidx];
1090         /* Update potential energies */
1091         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1092         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1093
1094         /* Increment number of inner iterations */
1095         inneriter                  += j_index_end - j_index_start;
1096
1097         /* Outer loop uses 20 flops */
1098     }
1099
1100     /* Increment number of outer iterations */
1101     outeriter        += nri;
1102
1103     /* Update outer/inner flops */
1104
1105     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*294);
1106 }
1107 /*
1108  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_single
1109  * Electrostatics interaction: Ewald
1110  * VdW interaction:            LJEwald
1111  * Geometry:                   Water3-Water3
1112  * Calculate force/pot:        Force
1113  */
1114 void
1115 nb_kernel_ElecEw_VdwLJEw_GeomW3W3_F_avx_128_fma_single
1116                     (t_nblist                    * gmx_restrict       nlist,
1117                      rvec                        * gmx_restrict          xx,
1118                      rvec                        * gmx_restrict          ff,
1119                      t_forcerec                  * gmx_restrict          fr,
1120                      t_mdatoms                   * gmx_restrict     mdatoms,
1121                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1122                      t_nrnb                      * gmx_restrict        nrnb)
1123 {
1124     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1125      * just 0 for non-waters.
1126      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1127      * jnr indices corresponding to data put in the four positions in the SIMD register.
1128      */
1129     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1130     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1131     int              jnrA,jnrB,jnrC,jnrD;
1132     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1133     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1134     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1135     real             rcutoff_scalar;
1136     real             *shiftvec,*fshift,*x,*f;
1137     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1138     real             scratch[4*DIM];
1139     __m128           fscal,rcutoff,rcutoff2,jidxall;
1140     int              vdwioffset0;
1141     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1142     int              vdwioffset1;
1143     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1144     int              vdwioffset2;
1145     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1146     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1147     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1148     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1149     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1150     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1151     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1152     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1153     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1154     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1155     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1156     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1157     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1158     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1159     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1160     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1161     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1162     real             *charge;
1163     int              nvdwtype;
1164     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1165     int              *vdwtype;
1166     real             *vdwparam;
1167     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1168     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1169     __m128           c6grid_00;
1170     __m128           c6grid_01;
1171     __m128           c6grid_02;
1172     __m128           c6grid_10;
1173     __m128           c6grid_11;
1174     __m128           c6grid_12;
1175     __m128           c6grid_20;
1176     __m128           c6grid_21;
1177     __m128           c6grid_22;
1178     real             *vdwgridparam;
1179     __m128           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
1180     __m128           one_half = _mm_set1_ps(0.5);
1181     __m128           minus_one = _mm_set1_ps(-1.0);
1182     __m128i          ewitab;
1183     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1184     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1185     real             *ewtab;
1186     __m128           dummy_mask,cutoff_mask;
1187     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1188     __m128           one     = _mm_set1_ps(1.0);
1189     __m128           two     = _mm_set1_ps(2.0);
1190     x                = xx[0];
1191     f                = ff[0];
1192
1193     nri              = nlist->nri;
1194     iinr             = nlist->iinr;
1195     jindex           = nlist->jindex;
1196     jjnr             = nlist->jjnr;
1197     shiftidx         = nlist->shift;
1198     gid              = nlist->gid;
1199     shiftvec         = fr->shift_vec[0];
1200     fshift           = fr->fshift[0];
1201     facel            = _mm_set1_ps(fr->epsfac);
1202     charge           = mdatoms->chargeA;
1203     nvdwtype         = fr->ntype;
1204     vdwparam         = fr->nbfp;
1205     vdwtype          = mdatoms->typeA;
1206     vdwgridparam     = fr->ljpme_c6grid;
1207     sh_lj_ewald      = _mm_set1_ps(fr->ic->sh_lj_ewald);
1208     ewclj            = _mm_set1_ps(fr->ewaldcoeff_lj);
1209     ewclj2           = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));
1210
1211     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1212     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1213     beta2            = _mm_mul_ps(beta,beta);
1214     beta3            = _mm_mul_ps(beta,beta2);
1215     ewtab            = fr->ic->tabq_coul_F;
1216     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1217     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1218
1219     /* Setup water-specific parameters */
1220     inr              = nlist->iinr[0];
1221     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1222     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1223     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1224     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1225
1226     jq0              = _mm_set1_ps(charge[inr+0]);
1227     jq1              = _mm_set1_ps(charge[inr+1]);
1228     jq2              = _mm_set1_ps(charge[inr+2]);
1229     vdwjidx0A        = 2*vdwtype[inr+0];
1230     qq00             = _mm_mul_ps(iq0,jq0);
1231     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1232     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1233     c6grid_00        = _mm_set1_ps(vdwgridparam[vdwioffset0+vdwjidx0A]);
1234     qq01             = _mm_mul_ps(iq0,jq1);
1235     qq02             = _mm_mul_ps(iq0,jq2);
1236     qq10             = _mm_mul_ps(iq1,jq0);
1237     qq11             = _mm_mul_ps(iq1,jq1);
1238     qq12             = _mm_mul_ps(iq1,jq2);
1239     qq20             = _mm_mul_ps(iq2,jq0);
1240     qq21             = _mm_mul_ps(iq2,jq1);
1241     qq22             = _mm_mul_ps(iq2,jq2);
1242
1243     /* Avoid stupid compiler warnings */
1244     jnrA = jnrB = jnrC = jnrD = 0;
1245     j_coord_offsetA = 0;
1246     j_coord_offsetB = 0;
1247     j_coord_offsetC = 0;
1248     j_coord_offsetD = 0;
1249
1250     outeriter        = 0;
1251     inneriter        = 0;
1252
1253     for(iidx=0;iidx<4*DIM;iidx++)
1254     {
1255         scratch[iidx] = 0.0;
1256     }
1257
1258     /* Start outer loop over neighborlists */
1259     for(iidx=0; iidx<nri; iidx++)
1260     {
1261         /* Load shift vector for this list */
1262         i_shift_offset   = DIM*shiftidx[iidx];
1263
1264         /* Load limits for loop over neighbors */
1265         j_index_start    = jindex[iidx];
1266         j_index_end      = jindex[iidx+1];
1267
1268         /* Get outer coordinate index */
1269         inr              = iinr[iidx];
1270         i_coord_offset   = DIM*inr;
1271
1272         /* Load i particle coords and add shift vector */
1273         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1274                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1275
1276         fix0             = _mm_setzero_ps();
1277         fiy0             = _mm_setzero_ps();
1278         fiz0             = _mm_setzero_ps();
1279         fix1             = _mm_setzero_ps();
1280         fiy1             = _mm_setzero_ps();
1281         fiz1             = _mm_setzero_ps();
1282         fix2             = _mm_setzero_ps();
1283         fiy2             = _mm_setzero_ps();
1284         fiz2             = _mm_setzero_ps();
1285
1286         /* Start inner kernel loop */
1287         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1288         {
1289
1290             /* Get j neighbor index, and coordinate index */
1291             jnrA             = jjnr[jidx];
1292             jnrB             = jjnr[jidx+1];
1293             jnrC             = jjnr[jidx+2];
1294             jnrD             = jjnr[jidx+3];
1295             j_coord_offsetA  = DIM*jnrA;
1296             j_coord_offsetB  = DIM*jnrB;
1297             j_coord_offsetC  = DIM*jnrC;
1298             j_coord_offsetD  = DIM*jnrD;
1299
1300             /* load j atom coordinates */
1301             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1302                                               x+j_coord_offsetC,x+j_coord_offsetD,
1303                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1304
1305             /* Calculate displacement vector */
1306             dx00             = _mm_sub_ps(ix0,jx0);
1307             dy00             = _mm_sub_ps(iy0,jy0);
1308             dz00             = _mm_sub_ps(iz0,jz0);
1309             dx01             = _mm_sub_ps(ix0,jx1);
1310             dy01             = _mm_sub_ps(iy0,jy1);
1311             dz01             = _mm_sub_ps(iz0,jz1);
1312             dx02             = _mm_sub_ps(ix0,jx2);
1313             dy02             = _mm_sub_ps(iy0,jy2);
1314             dz02             = _mm_sub_ps(iz0,jz2);
1315             dx10             = _mm_sub_ps(ix1,jx0);
1316             dy10             = _mm_sub_ps(iy1,jy0);
1317             dz10             = _mm_sub_ps(iz1,jz0);
1318             dx11             = _mm_sub_ps(ix1,jx1);
1319             dy11             = _mm_sub_ps(iy1,jy1);
1320             dz11             = _mm_sub_ps(iz1,jz1);
1321             dx12             = _mm_sub_ps(ix1,jx2);
1322             dy12             = _mm_sub_ps(iy1,jy2);
1323             dz12             = _mm_sub_ps(iz1,jz2);
1324             dx20             = _mm_sub_ps(ix2,jx0);
1325             dy20             = _mm_sub_ps(iy2,jy0);
1326             dz20             = _mm_sub_ps(iz2,jz0);
1327             dx21             = _mm_sub_ps(ix2,jx1);
1328             dy21             = _mm_sub_ps(iy2,jy1);
1329             dz21             = _mm_sub_ps(iz2,jz1);
1330             dx22             = _mm_sub_ps(ix2,jx2);
1331             dy22             = _mm_sub_ps(iy2,jy2);
1332             dz22             = _mm_sub_ps(iz2,jz2);
1333
1334             /* Calculate squared distance and things based on it */
1335             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1336             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1337             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1338             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1339             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1340             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1341             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1342             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1343             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1344
1345             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1346             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1347             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1348             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1349             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1350             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1351             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1352             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1353             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1354
1355             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1356             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1357             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1358             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1359             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1360             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1361             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1362             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1363             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1364
1365             fjx0             = _mm_setzero_ps();
1366             fjy0             = _mm_setzero_ps();
1367             fjz0             = _mm_setzero_ps();
1368             fjx1             = _mm_setzero_ps();
1369             fjy1             = _mm_setzero_ps();
1370             fjz1             = _mm_setzero_ps();
1371             fjx2             = _mm_setzero_ps();
1372             fjy2             = _mm_setzero_ps();
1373             fjz2             = _mm_setzero_ps();
1374
1375             /**************************
1376              * CALCULATE INTERACTIONS *
1377              **************************/
1378
1379             r00              = _mm_mul_ps(rsq00,rinv00);
1380
1381             /* EWALD ELECTROSTATICS */
1382
1383             /* Analytical PME correction */
1384             zeta2            = _mm_mul_ps(beta2,rsq00);
1385             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1386             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1387             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1388             felec            = _mm_mul_ps(qq00,felec);
1389
1390             /* Analytical LJ-PME */
1391             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1392             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
1393             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1394             exponent         = gmx_simd_exp_r(ewcljrsq);
1395             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1396             poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
1397             /* f6A = 6 * C6grid * (1 - poly) */
1398             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1399             /* f6B = C6grid * exponent * beta^6 */
1400             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1401             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1402             fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1403
1404             fscal            = _mm_add_ps(felec,fvdw);
1405
1406              /* Update vectorial force */
1407             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1408             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1409             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1410
1411             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1412             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1413             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1414
1415             /**************************
1416              * CALCULATE INTERACTIONS *
1417              **************************/
1418
1419             r01              = _mm_mul_ps(rsq01,rinv01);
1420
1421             /* EWALD ELECTROSTATICS */
1422
1423             /* Analytical PME correction */
1424             zeta2            = _mm_mul_ps(beta2,rsq01);
1425             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1426             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1427             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1428             felec            = _mm_mul_ps(qq01,felec);
1429
1430             fscal            = felec;
1431
1432              /* Update vectorial force */
1433             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1434             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1435             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1436
1437             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1438             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1439             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1440
1441             /**************************
1442              * CALCULATE INTERACTIONS *
1443              **************************/
1444
1445             r02              = _mm_mul_ps(rsq02,rinv02);
1446
1447             /* EWALD ELECTROSTATICS */
1448
1449             /* Analytical PME correction */
1450             zeta2            = _mm_mul_ps(beta2,rsq02);
1451             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1452             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1453             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1454             felec            = _mm_mul_ps(qq02,felec);
1455
1456             fscal            = felec;
1457
1458              /* Update vectorial force */
1459             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1460             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1461             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1462
1463             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1464             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1465             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1466
1467             /**************************
1468              * CALCULATE INTERACTIONS *
1469              **************************/
1470
1471             r10              = _mm_mul_ps(rsq10,rinv10);
1472
1473             /* EWALD ELECTROSTATICS */
1474
1475             /* Analytical PME correction */
1476             zeta2            = _mm_mul_ps(beta2,rsq10);
1477             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1478             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1479             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1480             felec            = _mm_mul_ps(qq10,felec);
1481
1482             fscal            = felec;
1483
1484              /* Update vectorial force */
1485             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1486             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1487             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1488
1489             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1490             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1491             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1492
1493             /**************************
1494              * CALCULATE INTERACTIONS *
1495              **************************/
1496
1497             r11              = _mm_mul_ps(rsq11,rinv11);
1498
1499             /* EWALD ELECTROSTATICS */
1500
1501             /* Analytical PME correction */
1502             zeta2            = _mm_mul_ps(beta2,rsq11);
1503             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1504             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1505             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1506             felec            = _mm_mul_ps(qq11,felec);
1507
1508             fscal            = felec;
1509
1510              /* Update vectorial force */
1511             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1512             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1513             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1514
1515             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1516             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1517             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1518
1519             /**************************
1520              * CALCULATE INTERACTIONS *
1521              **************************/
1522
1523             r12              = _mm_mul_ps(rsq12,rinv12);
1524
1525             /* EWALD ELECTROSTATICS */
1526
1527             /* Analytical PME correction */
1528             zeta2            = _mm_mul_ps(beta2,rsq12);
1529             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1530             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1531             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1532             felec            = _mm_mul_ps(qq12,felec);
1533
1534             fscal            = felec;
1535
1536              /* Update vectorial force */
1537             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1538             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1539             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1540
1541             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1542             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1543             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1544
1545             /**************************
1546              * CALCULATE INTERACTIONS *
1547              **************************/
1548
1549             r20              = _mm_mul_ps(rsq20,rinv20);
1550
1551             /* EWALD ELECTROSTATICS */
1552
1553             /* Analytical PME correction */
1554             zeta2            = _mm_mul_ps(beta2,rsq20);
1555             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1556             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1557             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1558             felec            = _mm_mul_ps(qq20,felec);
1559
1560             fscal            = felec;
1561
1562              /* Update vectorial force */
1563             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1564             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1565             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1566
1567             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1568             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1569             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1570
1571             /**************************
1572              * CALCULATE INTERACTIONS *
1573              **************************/
1574
1575             r21              = _mm_mul_ps(rsq21,rinv21);
1576
1577             /* EWALD ELECTROSTATICS */
1578
1579             /* Analytical PME correction */
1580             zeta2            = _mm_mul_ps(beta2,rsq21);
1581             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1582             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1583             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1584             felec            = _mm_mul_ps(qq21,felec);
1585
1586             fscal            = felec;
1587
1588              /* Update vectorial force */
1589             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1590             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1591             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1592
1593             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1594             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1595             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1596
1597             /**************************
1598              * CALCULATE INTERACTIONS *
1599              **************************/
1600
1601             r22              = _mm_mul_ps(rsq22,rinv22);
1602
1603             /* EWALD ELECTROSTATICS */
1604
1605             /* Analytical PME correction */
1606             zeta2            = _mm_mul_ps(beta2,rsq22);
1607             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1608             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1609             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1610             felec            = _mm_mul_ps(qq22,felec);
1611
1612             fscal            = felec;
1613
1614              /* Update vectorial force */
1615             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1616             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1617             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1618
1619             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1620             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1621             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1622
1623             fjptrA             = f+j_coord_offsetA;
1624             fjptrB             = f+j_coord_offsetB;
1625             fjptrC             = f+j_coord_offsetC;
1626             fjptrD             = f+j_coord_offsetD;
1627
1628             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1629                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1630
1631             /* Inner loop uses 273 flops */
1632         }
1633
1634         if(jidx<j_index_end)
1635         {
1636
1637             /* Get j neighbor index, and coordinate index */
1638             jnrlistA         = jjnr[jidx];
1639             jnrlistB         = jjnr[jidx+1];
1640             jnrlistC         = jjnr[jidx+2];
1641             jnrlistD         = jjnr[jidx+3];
1642             /* Sign of each element will be negative for non-real atoms.
1643              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1644              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1645              */
1646             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1647             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1648             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1649             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1650             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1651             j_coord_offsetA  = DIM*jnrA;
1652             j_coord_offsetB  = DIM*jnrB;
1653             j_coord_offsetC  = DIM*jnrC;
1654             j_coord_offsetD  = DIM*jnrD;
1655
1656             /* load j atom coordinates */
1657             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1658                                               x+j_coord_offsetC,x+j_coord_offsetD,
1659                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1660
1661             /* Calculate displacement vector */
1662             dx00             = _mm_sub_ps(ix0,jx0);
1663             dy00             = _mm_sub_ps(iy0,jy0);
1664             dz00             = _mm_sub_ps(iz0,jz0);
1665             dx01             = _mm_sub_ps(ix0,jx1);
1666             dy01             = _mm_sub_ps(iy0,jy1);
1667             dz01             = _mm_sub_ps(iz0,jz1);
1668             dx02             = _mm_sub_ps(ix0,jx2);
1669             dy02             = _mm_sub_ps(iy0,jy2);
1670             dz02             = _mm_sub_ps(iz0,jz2);
1671             dx10             = _mm_sub_ps(ix1,jx0);
1672             dy10             = _mm_sub_ps(iy1,jy0);
1673             dz10             = _mm_sub_ps(iz1,jz0);
1674             dx11             = _mm_sub_ps(ix1,jx1);
1675             dy11             = _mm_sub_ps(iy1,jy1);
1676             dz11             = _mm_sub_ps(iz1,jz1);
1677             dx12             = _mm_sub_ps(ix1,jx2);
1678             dy12             = _mm_sub_ps(iy1,jy2);
1679             dz12             = _mm_sub_ps(iz1,jz2);
1680             dx20             = _mm_sub_ps(ix2,jx0);
1681             dy20             = _mm_sub_ps(iy2,jy0);
1682             dz20             = _mm_sub_ps(iz2,jz0);
1683             dx21             = _mm_sub_ps(ix2,jx1);
1684             dy21             = _mm_sub_ps(iy2,jy1);
1685             dz21             = _mm_sub_ps(iz2,jz1);
1686             dx22             = _mm_sub_ps(ix2,jx2);
1687             dy22             = _mm_sub_ps(iy2,jy2);
1688             dz22             = _mm_sub_ps(iz2,jz2);
1689
1690             /* Calculate squared distance and things based on it */
1691             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1692             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1693             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1694             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1695             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1696             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1697             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1698             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1699             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1700
1701             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1702             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1703             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1704             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1705             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1706             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1707             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1708             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1709             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1710
1711             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1712             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1713             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1714             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1715             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1716             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1717             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1718             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1719             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1720
1721             fjx0             = _mm_setzero_ps();
1722             fjy0             = _mm_setzero_ps();
1723             fjz0             = _mm_setzero_ps();
1724             fjx1             = _mm_setzero_ps();
1725             fjy1             = _mm_setzero_ps();
1726             fjz1             = _mm_setzero_ps();
1727             fjx2             = _mm_setzero_ps();
1728             fjy2             = _mm_setzero_ps();
1729             fjz2             = _mm_setzero_ps();
1730
1731             /**************************
1732              * CALCULATE INTERACTIONS *
1733              **************************/
1734
1735             r00              = _mm_mul_ps(rsq00,rinv00);
1736             r00              = _mm_andnot_ps(dummy_mask,r00);
1737
1738             /* EWALD ELECTROSTATICS */
1739
1740             /* Analytical PME correction */
1741             zeta2            = _mm_mul_ps(beta2,rsq00);
1742             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1743             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1744             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1745             felec            = _mm_mul_ps(qq00,felec);
1746
1747             /* Analytical LJ-PME */
1748             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1749             ewcljrsq         = _mm_mul_ps(ewclj2,rsq00);
1750             ewclj6           = _mm_mul_ps(ewclj2,_mm_mul_ps(ewclj2,ewclj2));
1751             exponent         = gmx_simd_exp_r(ewcljrsq);
1752             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
1753             poly             = _mm_mul_ps(exponent,_mm_macc_ps(_mm_mul_ps(ewcljrsq,ewcljrsq),one_half,_mm_sub_ps(one,ewcljrsq)));
1754             /* f6A = 6 * C6grid * (1 - poly) */
1755             f6A              = _mm_mul_ps(c6grid_00,_mm_sub_ps(one,poly));
1756             /* f6B = C6grid * exponent * beta^6 */
1757             f6B              = _mm_mul_ps(_mm_mul_ps(c6grid_00,one_sixth),_mm_mul_ps(exponent,ewclj6));
1758             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
1759             fvdw              = _mm_mul_ps(_mm_macc_ps(_mm_msub_ps(c12_00,rinvsix,_mm_sub_ps(c6_00,f6A)),rinvsix,f6B),rinvsq00);
1760
1761             fscal            = _mm_add_ps(felec,fvdw);
1762
1763             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1764
1765              /* Update vectorial force */
1766             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1767             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1768             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1769
1770             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1771             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1772             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1773
1774             /**************************
1775              * CALCULATE INTERACTIONS *
1776              **************************/
1777
1778             r01              = _mm_mul_ps(rsq01,rinv01);
1779             r01              = _mm_andnot_ps(dummy_mask,r01);
1780
1781             /* EWALD ELECTROSTATICS */
1782
1783             /* Analytical PME correction */
1784             zeta2            = _mm_mul_ps(beta2,rsq01);
1785             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1786             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1787             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1788             felec            = _mm_mul_ps(qq01,felec);
1789
1790             fscal            = felec;
1791
1792             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1793
1794              /* Update vectorial force */
1795             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1796             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1797             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1798
1799             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1800             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1801             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1802
1803             /**************************
1804              * CALCULATE INTERACTIONS *
1805              **************************/
1806
1807             r02              = _mm_mul_ps(rsq02,rinv02);
1808             r02              = _mm_andnot_ps(dummy_mask,r02);
1809
1810             /* EWALD ELECTROSTATICS */
1811
1812             /* Analytical PME correction */
1813             zeta2            = _mm_mul_ps(beta2,rsq02);
1814             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1815             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1816             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1817             felec            = _mm_mul_ps(qq02,felec);
1818
1819             fscal            = felec;
1820
1821             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1822
1823              /* Update vectorial force */
1824             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1825             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1826             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1827
1828             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1829             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1830             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1831
1832             /**************************
1833              * CALCULATE INTERACTIONS *
1834              **************************/
1835
1836             r10              = _mm_mul_ps(rsq10,rinv10);
1837             r10              = _mm_andnot_ps(dummy_mask,r10);
1838
1839             /* EWALD ELECTROSTATICS */
1840
1841             /* Analytical PME correction */
1842             zeta2            = _mm_mul_ps(beta2,rsq10);
1843             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1844             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1845             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1846             felec            = _mm_mul_ps(qq10,felec);
1847
1848             fscal            = felec;
1849
1850             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1851
1852              /* Update vectorial force */
1853             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1854             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1855             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1856
1857             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1858             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1859             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1860
1861             /**************************
1862              * CALCULATE INTERACTIONS *
1863              **************************/
1864
1865             r11              = _mm_mul_ps(rsq11,rinv11);
1866             r11              = _mm_andnot_ps(dummy_mask,r11);
1867
1868             /* EWALD ELECTROSTATICS */
1869
1870             /* Analytical PME correction */
1871             zeta2            = _mm_mul_ps(beta2,rsq11);
1872             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1873             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1874             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1875             felec            = _mm_mul_ps(qq11,felec);
1876
1877             fscal            = felec;
1878
1879             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1880
1881              /* Update vectorial force */
1882             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1883             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1884             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1885
1886             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1887             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1888             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1889
1890             /**************************
1891              * CALCULATE INTERACTIONS *
1892              **************************/
1893
1894             r12              = _mm_mul_ps(rsq12,rinv12);
1895             r12              = _mm_andnot_ps(dummy_mask,r12);
1896
1897             /* EWALD ELECTROSTATICS */
1898
1899             /* Analytical PME correction */
1900             zeta2            = _mm_mul_ps(beta2,rsq12);
1901             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1902             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1903             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1904             felec            = _mm_mul_ps(qq12,felec);
1905
1906             fscal            = felec;
1907
1908             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1909
1910              /* Update vectorial force */
1911             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1912             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1913             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1914
1915             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1916             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1917             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1918
1919             /**************************
1920              * CALCULATE INTERACTIONS *
1921              **************************/
1922
1923             r20              = _mm_mul_ps(rsq20,rinv20);
1924             r20              = _mm_andnot_ps(dummy_mask,r20);
1925
1926             /* EWALD ELECTROSTATICS */
1927
1928             /* Analytical PME correction */
1929             zeta2            = _mm_mul_ps(beta2,rsq20);
1930             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1931             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1932             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1933             felec            = _mm_mul_ps(qq20,felec);
1934
1935             fscal            = felec;
1936
1937             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1938
1939              /* Update vectorial force */
1940             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1941             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1942             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1943
1944             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1945             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1946             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1947
1948             /**************************
1949              * CALCULATE INTERACTIONS *
1950              **************************/
1951
1952             r21              = _mm_mul_ps(rsq21,rinv21);
1953             r21              = _mm_andnot_ps(dummy_mask,r21);
1954
1955             /* EWALD ELECTROSTATICS */
1956
1957             /* Analytical PME correction */
1958             zeta2            = _mm_mul_ps(beta2,rsq21);
1959             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1960             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1961             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1962             felec            = _mm_mul_ps(qq21,felec);
1963
1964             fscal            = felec;
1965
1966             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1967
1968              /* Update vectorial force */
1969             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1970             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1971             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1972
1973             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1974             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1975             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1976
1977             /**************************
1978              * CALCULATE INTERACTIONS *
1979              **************************/
1980
1981             r22              = _mm_mul_ps(rsq22,rinv22);
1982             r22              = _mm_andnot_ps(dummy_mask,r22);
1983
1984             /* EWALD ELECTROSTATICS */
1985
1986             /* Analytical PME correction */
1987             zeta2            = _mm_mul_ps(beta2,rsq22);
1988             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1989             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1990             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1991             felec            = _mm_mul_ps(qq22,felec);
1992
1993             fscal            = felec;
1994
1995             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1996
1997              /* Update vectorial force */
1998             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1999             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2000             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2001
2002             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2003             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2004             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2005
2006             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2007             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2008             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2009             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2010
2011             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2012                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2013
2014             /* Inner loop uses 282 flops */
2015         }
2016
2017         /* End of innermost loop */
2018
2019         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2020                                               f+i_coord_offset,fshift+i_shift_offset);
2021
2022         /* Increment number of inner iterations */
2023         inneriter                  += j_index_end - j_index_start;
2024
2025         /* Outer loop uses 18 flops */
2026     }
2027
2028     /* Increment number of outer iterations */
2029     outeriter        += nri;
2030
2031     /* Update outer/inner flops */
2032
2033     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*282);
2034 }