Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_128_fma_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
110     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
111     __m128i          ewitab;
112     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
113     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
114     real             *ewtab;
115     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
116     real             rswitch_scalar,d_scalar;
117     __m128           dummy_mask,cutoff_mask;
118     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
119     __m128           one     = _mm_set1_ps(1.0);
120     __m128           two     = _mm_set1_ps(2.0);
121     x                = xx[0];
122     f                = ff[0];
123
124     nri              = nlist->nri;
125     iinr             = nlist->iinr;
126     jindex           = nlist->jindex;
127     jjnr             = nlist->jjnr;
128     shiftidx         = nlist->shift;
129     gid              = nlist->gid;
130     shiftvec         = fr->shift_vec[0];
131     fshift           = fr->fshift[0];
132     facel            = _mm_set1_ps(fr->ic->epsfac);
133     charge           = mdatoms->chargeA;
134     nvdwtype         = fr->ntype;
135     vdwparam         = fr->nbfp;
136     vdwtype          = mdatoms->typeA;
137
138     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
139     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
140     beta2            = _mm_mul_ps(beta,beta);
141     beta3            = _mm_mul_ps(beta,beta2);
142     ewtab            = fr->ic->tabq_coul_FDV0;
143     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
144     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
145
146     /* Setup water-specific parameters */
147     inr              = nlist->iinr[0];
148     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
149     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
150     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
151     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
152
153     jq0              = _mm_set1_ps(charge[inr+0]);
154     jq1              = _mm_set1_ps(charge[inr+1]);
155     jq2              = _mm_set1_ps(charge[inr+2]);
156     vdwjidx0A        = 2*vdwtype[inr+0];
157     qq00             = _mm_mul_ps(iq0,jq0);
158     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
159     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
160     qq01             = _mm_mul_ps(iq0,jq1);
161     qq02             = _mm_mul_ps(iq0,jq2);
162     qq10             = _mm_mul_ps(iq1,jq0);
163     qq11             = _mm_mul_ps(iq1,jq1);
164     qq12             = _mm_mul_ps(iq1,jq2);
165     qq20             = _mm_mul_ps(iq2,jq0);
166     qq21             = _mm_mul_ps(iq2,jq1);
167     qq22             = _mm_mul_ps(iq2,jq2);
168
169     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
170     rcutoff_scalar   = fr->ic->rcoulomb;
171     rcutoff          = _mm_set1_ps(rcutoff_scalar);
172     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
173
174     rswitch_scalar   = fr->ic->rcoulomb_switch;
175     rswitch          = _mm_set1_ps(rswitch_scalar);
176     /* Setup switch parameters */
177     d_scalar         = rcutoff_scalar-rswitch_scalar;
178     d                = _mm_set1_ps(d_scalar);
179     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
180     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
181     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
182     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
183     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
184     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
185
186     /* Avoid stupid compiler warnings */
187     jnrA = jnrB = jnrC = jnrD = 0;
188     j_coord_offsetA = 0;
189     j_coord_offsetB = 0;
190     j_coord_offsetC = 0;
191     j_coord_offsetD = 0;
192
193     outeriter        = 0;
194     inneriter        = 0;
195
196     for(iidx=0;iidx<4*DIM;iidx++)
197     {
198         scratch[iidx] = 0.0;
199     }
200
201     /* Start outer loop over neighborlists */
202     for(iidx=0; iidx<nri; iidx++)
203     {
204         /* Load shift vector for this list */
205         i_shift_offset   = DIM*shiftidx[iidx];
206
207         /* Load limits for loop over neighbors */
208         j_index_start    = jindex[iidx];
209         j_index_end      = jindex[iidx+1];
210
211         /* Get outer coordinate index */
212         inr              = iinr[iidx];
213         i_coord_offset   = DIM*inr;
214
215         /* Load i particle coords and add shift vector */
216         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
217                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
218
219         fix0             = _mm_setzero_ps();
220         fiy0             = _mm_setzero_ps();
221         fiz0             = _mm_setzero_ps();
222         fix1             = _mm_setzero_ps();
223         fiy1             = _mm_setzero_ps();
224         fiz1             = _mm_setzero_ps();
225         fix2             = _mm_setzero_ps();
226         fiy2             = _mm_setzero_ps();
227         fiz2             = _mm_setzero_ps();
228
229         /* Reset potential sums */
230         velecsum         = _mm_setzero_ps();
231         vvdwsum          = _mm_setzero_ps();
232
233         /* Start inner kernel loop */
234         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
235         {
236
237             /* Get j neighbor index, and coordinate index */
238             jnrA             = jjnr[jidx];
239             jnrB             = jjnr[jidx+1];
240             jnrC             = jjnr[jidx+2];
241             jnrD             = jjnr[jidx+3];
242             j_coord_offsetA  = DIM*jnrA;
243             j_coord_offsetB  = DIM*jnrB;
244             j_coord_offsetC  = DIM*jnrC;
245             j_coord_offsetD  = DIM*jnrD;
246
247             /* load j atom coordinates */
248             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
249                                               x+j_coord_offsetC,x+j_coord_offsetD,
250                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
251
252             /* Calculate displacement vector */
253             dx00             = _mm_sub_ps(ix0,jx0);
254             dy00             = _mm_sub_ps(iy0,jy0);
255             dz00             = _mm_sub_ps(iz0,jz0);
256             dx01             = _mm_sub_ps(ix0,jx1);
257             dy01             = _mm_sub_ps(iy0,jy1);
258             dz01             = _mm_sub_ps(iz0,jz1);
259             dx02             = _mm_sub_ps(ix0,jx2);
260             dy02             = _mm_sub_ps(iy0,jy2);
261             dz02             = _mm_sub_ps(iz0,jz2);
262             dx10             = _mm_sub_ps(ix1,jx0);
263             dy10             = _mm_sub_ps(iy1,jy0);
264             dz10             = _mm_sub_ps(iz1,jz0);
265             dx11             = _mm_sub_ps(ix1,jx1);
266             dy11             = _mm_sub_ps(iy1,jy1);
267             dz11             = _mm_sub_ps(iz1,jz1);
268             dx12             = _mm_sub_ps(ix1,jx2);
269             dy12             = _mm_sub_ps(iy1,jy2);
270             dz12             = _mm_sub_ps(iz1,jz2);
271             dx20             = _mm_sub_ps(ix2,jx0);
272             dy20             = _mm_sub_ps(iy2,jy0);
273             dz20             = _mm_sub_ps(iz2,jz0);
274             dx21             = _mm_sub_ps(ix2,jx1);
275             dy21             = _mm_sub_ps(iy2,jy1);
276             dz21             = _mm_sub_ps(iz2,jz1);
277             dx22             = _mm_sub_ps(ix2,jx2);
278             dy22             = _mm_sub_ps(iy2,jy2);
279             dz22             = _mm_sub_ps(iz2,jz2);
280
281             /* Calculate squared distance and things based on it */
282             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
283             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
284             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
285             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
286             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
287             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
288             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
289             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
290             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
291
292             rinv00           = avx128fma_invsqrt_f(rsq00);
293             rinv01           = avx128fma_invsqrt_f(rsq01);
294             rinv02           = avx128fma_invsqrt_f(rsq02);
295             rinv10           = avx128fma_invsqrt_f(rsq10);
296             rinv11           = avx128fma_invsqrt_f(rsq11);
297             rinv12           = avx128fma_invsqrt_f(rsq12);
298             rinv20           = avx128fma_invsqrt_f(rsq20);
299             rinv21           = avx128fma_invsqrt_f(rsq21);
300             rinv22           = avx128fma_invsqrt_f(rsq22);
301
302             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
303             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
304             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
305             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
306             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
307             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
308             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
309             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
310             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
311
312             fjx0             = _mm_setzero_ps();
313             fjy0             = _mm_setzero_ps();
314             fjz0             = _mm_setzero_ps();
315             fjx1             = _mm_setzero_ps();
316             fjy1             = _mm_setzero_ps();
317             fjz1             = _mm_setzero_ps();
318             fjx2             = _mm_setzero_ps();
319             fjy2             = _mm_setzero_ps();
320             fjz2             = _mm_setzero_ps();
321
322             /**************************
323              * CALCULATE INTERACTIONS *
324              **************************/
325
326             if (gmx_mm_any_lt(rsq00,rcutoff2))
327             {
328
329             r00              = _mm_mul_ps(rsq00,rinv00);
330
331             /* EWALD ELECTROSTATICS */
332
333             /* Analytical PME correction */
334             zeta2            = _mm_mul_ps(beta2,rsq00);
335             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
336             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
337             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
338             felec            = _mm_mul_ps(qq00,felec);
339             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
340             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
341             velec            = _mm_mul_ps(qq00,velec);
342
343             /* LENNARD-JONES DISPERSION/REPULSION */
344
345             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
346             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
347             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
348             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
349             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
350
351             d                = _mm_sub_ps(r00,rswitch);
352             d                = _mm_max_ps(d,_mm_setzero_ps());
353             d2               = _mm_mul_ps(d,d);
354             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
355
356             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
357
358             /* Evaluate switch function */
359             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
360             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
361             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
362             velec            = _mm_mul_ps(velec,sw);
363             vvdw             = _mm_mul_ps(vvdw,sw);
364             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
365
366             /* Update potential sum for this i atom from the interaction with this j atom. */
367             velec            = _mm_and_ps(velec,cutoff_mask);
368             velecsum         = _mm_add_ps(velecsum,velec);
369             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
370             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
371
372             fscal            = _mm_add_ps(felec,fvdw);
373
374             fscal            = _mm_and_ps(fscal,cutoff_mask);
375
376              /* Update vectorial force */
377             fix0             = _mm_macc_ps(dx00,fscal,fix0);
378             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
379             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
380
381             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
382             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
383             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
384
385             }
386
387             /**************************
388              * CALCULATE INTERACTIONS *
389              **************************/
390
391             if (gmx_mm_any_lt(rsq01,rcutoff2))
392             {
393
394             r01              = _mm_mul_ps(rsq01,rinv01);
395
396             /* EWALD ELECTROSTATICS */
397
398             /* Analytical PME correction */
399             zeta2            = _mm_mul_ps(beta2,rsq01);
400             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
401             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
402             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
403             felec            = _mm_mul_ps(qq01,felec);
404             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
405             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
406             velec            = _mm_mul_ps(qq01,velec);
407
408             d                = _mm_sub_ps(r01,rswitch);
409             d                = _mm_max_ps(d,_mm_setzero_ps());
410             d2               = _mm_mul_ps(d,d);
411             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
412
413             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
414
415             /* Evaluate switch function */
416             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
417             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
418             velec            = _mm_mul_ps(velec,sw);
419             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
420
421             /* Update potential sum for this i atom from the interaction with this j atom. */
422             velec            = _mm_and_ps(velec,cutoff_mask);
423             velecsum         = _mm_add_ps(velecsum,velec);
424
425             fscal            = felec;
426
427             fscal            = _mm_and_ps(fscal,cutoff_mask);
428
429              /* Update vectorial force */
430             fix0             = _mm_macc_ps(dx01,fscal,fix0);
431             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
432             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
433
434             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
435             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
436             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
437
438             }
439
440             /**************************
441              * CALCULATE INTERACTIONS *
442              **************************/
443
444             if (gmx_mm_any_lt(rsq02,rcutoff2))
445             {
446
447             r02              = _mm_mul_ps(rsq02,rinv02);
448
449             /* EWALD ELECTROSTATICS */
450
451             /* Analytical PME correction */
452             zeta2            = _mm_mul_ps(beta2,rsq02);
453             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
454             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
455             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
456             felec            = _mm_mul_ps(qq02,felec);
457             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
458             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
459             velec            = _mm_mul_ps(qq02,velec);
460
461             d                = _mm_sub_ps(r02,rswitch);
462             d                = _mm_max_ps(d,_mm_setzero_ps());
463             d2               = _mm_mul_ps(d,d);
464             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
465
466             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
467
468             /* Evaluate switch function */
469             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
470             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
471             velec            = _mm_mul_ps(velec,sw);
472             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
473
474             /* Update potential sum for this i atom from the interaction with this j atom. */
475             velec            = _mm_and_ps(velec,cutoff_mask);
476             velecsum         = _mm_add_ps(velecsum,velec);
477
478             fscal            = felec;
479
480             fscal            = _mm_and_ps(fscal,cutoff_mask);
481
482              /* Update vectorial force */
483             fix0             = _mm_macc_ps(dx02,fscal,fix0);
484             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
485             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
486
487             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
488             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
489             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
490
491             }
492
493             /**************************
494              * CALCULATE INTERACTIONS *
495              **************************/
496
497             if (gmx_mm_any_lt(rsq10,rcutoff2))
498             {
499
500             r10              = _mm_mul_ps(rsq10,rinv10);
501
502             /* EWALD ELECTROSTATICS */
503
504             /* Analytical PME correction */
505             zeta2            = _mm_mul_ps(beta2,rsq10);
506             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
507             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
508             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
509             felec            = _mm_mul_ps(qq10,felec);
510             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
511             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
512             velec            = _mm_mul_ps(qq10,velec);
513
514             d                = _mm_sub_ps(r10,rswitch);
515             d                = _mm_max_ps(d,_mm_setzero_ps());
516             d2               = _mm_mul_ps(d,d);
517             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
518
519             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
520
521             /* Evaluate switch function */
522             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
523             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
524             velec            = _mm_mul_ps(velec,sw);
525             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
526
527             /* Update potential sum for this i atom from the interaction with this j atom. */
528             velec            = _mm_and_ps(velec,cutoff_mask);
529             velecsum         = _mm_add_ps(velecsum,velec);
530
531             fscal            = felec;
532
533             fscal            = _mm_and_ps(fscal,cutoff_mask);
534
535              /* Update vectorial force */
536             fix1             = _mm_macc_ps(dx10,fscal,fix1);
537             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
538             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
539
540             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
541             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
542             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
543
544             }
545
546             /**************************
547              * CALCULATE INTERACTIONS *
548              **************************/
549
550             if (gmx_mm_any_lt(rsq11,rcutoff2))
551             {
552
553             r11              = _mm_mul_ps(rsq11,rinv11);
554
555             /* EWALD ELECTROSTATICS */
556
557             /* Analytical PME correction */
558             zeta2            = _mm_mul_ps(beta2,rsq11);
559             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
560             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
561             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
562             felec            = _mm_mul_ps(qq11,felec);
563             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
564             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
565             velec            = _mm_mul_ps(qq11,velec);
566
567             d                = _mm_sub_ps(r11,rswitch);
568             d                = _mm_max_ps(d,_mm_setzero_ps());
569             d2               = _mm_mul_ps(d,d);
570             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
571
572             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
573
574             /* Evaluate switch function */
575             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
576             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
577             velec            = _mm_mul_ps(velec,sw);
578             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
579
580             /* Update potential sum for this i atom from the interaction with this j atom. */
581             velec            = _mm_and_ps(velec,cutoff_mask);
582             velecsum         = _mm_add_ps(velecsum,velec);
583
584             fscal            = felec;
585
586             fscal            = _mm_and_ps(fscal,cutoff_mask);
587
588              /* Update vectorial force */
589             fix1             = _mm_macc_ps(dx11,fscal,fix1);
590             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
591             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
592
593             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
594             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
595             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
596
597             }
598
599             /**************************
600              * CALCULATE INTERACTIONS *
601              **************************/
602
603             if (gmx_mm_any_lt(rsq12,rcutoff2))
604             {
605
606             r12              = _mm_mul_ps(rsq12,rinv12);
607
608             /* EWALD ELECTROSTATICS */
609
610             /* Analytical PME correction */
611             zeta2            = _mm_mul_ps(beta2,rsq12);
612             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
613             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
614             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
615             felec            = _mm_mul_ps(qq12,felec);
616             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
617             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
618             velec            = _mm_mul_ps(qq12,velec);
619
620             d                = _mm_sub_ps(r12,rswitch);
621             d                = _mm_max_ps(d,_mm_setzero_ps());
622             d2               = _mm_mul_ps(d,d);
623             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
624
625             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
626
627             /* Evaluate switch function */
628             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
629             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
630             velec            = _mm_mul_ps(velec,sw);
631             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
632
633             /* Update potential sum for this i atom from the interaction with this j atom. */
634             velec            = _mm_and_ps(velec,cutoff_mask);
635             velecsum         = _mm_add_ps(velecsum,velec);
636
637             fscal            = felec;
638
639             fscal            = _mm_and_ps(fscal,cutoff_mask);
640
641              /* Update vectorial force */
642             fix1             = _mm_macc_ps(dx12,fscal,fix1);
643             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
644             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
645
646             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
647             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
648             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
649
650             }
651
652             /**************************
653              * CALCULATE INTERACTIONS *
654              **************************/
655
656             if (gmx_mm_any_lt(rsq20,rcutoff2))
657             {
658
659             r20              = _mm_mul_ps(rsq20,rinv20);
660
661             /* EWALD ELECTROSTATICS */
662
663             /* Analytical PME correction */
664             zeta2            = _mm_mul_ps(beta2,rsq20);
665             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
666             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
667             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
668             felec            = _mm_mul_ps(qq20,felec);
669             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
670             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
671             velec            = _mm_mul_ps(qq20,velec);
672
673             d                = _mm_sub_ps(r20,rswitch);
674             d                = _mm_max_ps(d,_mm_setzero_ps());
675             d2               = _mm_mul_ps(d,d);
676             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
677
678             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
679
680             /* Evaluate switch function */
681             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
682             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
683             velec            = _mm_mul_ps(velec,sw);
684             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
685
686             /* Update potential sum for this i atom from the interaction with this j atom. */
687             velec            = _mm_and_ps(velec,cutoff_mask);
688             velecsum         = _mm_add_ps(velecsum,velec);
689
690             fscal            = felec;
691
692             fscal            = _mm_and_ps(fscal,cutoff_mask);
693
694              /* Update vectorial force */
695             fix2             = _mm_macc_ps(dx20,fscal,fix2);
696             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
697             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
698
699             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
700             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
701             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
702
703             }
704
705             /**************************
706              * CALCULATE INTERACTIONS *
707              **************************/
708
709             if (gmx_mm_any_lt(rsq21,rcutoff2))
710             {
711
712             r21              = _mm_mul_ps(rsq21,rinv21);
713
714             /* EWALD ELECTROSTATICS */
715
716             /* Analytical PME correction */
717             zeta2            = _mm_mul_ps(beta2,rsq21);
718             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
719             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
720             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
721             felec            = _mm_mul_ps(qq21,felec);
722             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
723             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
724             velec            = _mm_mul_ps(qq21,velec);
725
726             d                = _mm_sub_ps(r21,rswitch);
727             d                = _mm_max_ps(d,_mm_setzero_ps());
728             d2               = _mm_mul_ps(d,d);
729             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
730
731             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
732
733             /* Evaluate switch function */
734             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
735             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
736             velec            = _mm_mul_ps(velec,sw);
737             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
738
739             /* Update potential sum for this i atom from the interaction with this j atom. */
740             velec            = _mm_and_ps(velec,cutoff_mask);
741             velecsum         = _mm_add_ps(velecsum,velec);
742
743             fscal            = felec;
744
745             fscal            = _mm_and_ps(fscal,cutoff_mask);
746
747              /* Update vectorial force */
748             fix2             = _mm_macc_ps(dx21,fscal,fix2);
749             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
750             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
751
752             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
753             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
754             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
755
756             }
757
758             /**************************
759              * CALCULATE INTERACTIONS *
760              **************************/
761
762             if (gmx_mm_any_lt(rsq22,rcutoff2))
763             {
764
765             r22              = _mm_mul_ps(rsq22,rinv22);
766
767             /* EWALD ELECTROSTATICS */
768
769             /* Analytical PME correction */
770             zeta2            = _mm_mul_ps(beta2,rsq22);
771             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
772             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
773             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
774             felec            = _mm_mul_ps(qq22,felec);
775             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
776             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
777             velec            = _mm_mul_ps(qq22,velec);
778
779             d                = _mm_sub_ps(r22,rswitch);
780             d                = _mm_max_ps(d,_mm_setzero_ps());
781             d2               = _mm_mul_ps(d,d);
782             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
783
784             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
785
786             /* Evaluate switch function */
787             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
788             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
789             velec            = _mm_mul_ps(velec,sw);
790             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
791
792             /* Update potential sum for this i atom from the interaction with this j atom. */
793             velec            = _mm_and_ps(velec,cutoff_mask);
794             velecsum         = _mm_add_ps(velecsum,velec);
795
796             fscal            = felec;
797
798             fscal            = _mm_and_ps(fscal,cutoff_mask);
799
800              /* Update vectorial force */
801             fix2             = _mm_macc_ps(dx22,fscal,fix2);
802             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
803             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
804
805             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
806             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
807             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
808
809             }
810
811             fjptrA             = f+j_coord_offsetA;
812             fjptrB             = f+j_coord_offsetB;
813             fjptrC             = f+j_coord_offsetC;
814             fjptrD             = f+j_coord_offsetD;
815
816             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
817                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
818
819             /* Inner loop uses 495 flops */
820         }
821
822         if(jidx<j_index_end)
823         {
824
825             /* Get j neighbor index, and coordinate index */
826             jnrlistA         = jjnr[jidx];
827             jnrlistB         = jjnr[jidx+1];
828             jnrlistC         = jjnr[jidx+2];
829             jnrlistD         = jjnr[jidx+3];
830             /* Sign of each element will be negative for non-real atoms.
831              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
832              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
833              */
834             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
835             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
836             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
837             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
838             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
839             j_coord_offsetA  = DIM*jnrA;
840             j_coord_offsetB  = DIM*jnrB;
841             j_coord_offsetC  = DIM*jnrC;
842             j_coord_offsetD  = DIM*jnrD;
843
844             /* load j atom coordinates */
845             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
846                                               x+j_coord_offsetC,x+j_coord_offsetD,
847                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
848
849             /* Calculate displacement vector */
850             dx00             = _mm_sub_ps(ix0,jx0);
851             dy00             = _mm_sub_ps(iy0,jy0);
852             dz00             = _mm_sub_ps(iz0,jz0);
853             dx01             = _mm_sub_ps(ix0,jx1);
854             dy01             = _mm_sub_ps(iy0,jy1);
855             dz01             = _mm_sub_ps(iz0,jz1);
856             dx02             = _mm_sub_ps(ix0,jx2);
857             dy02             = _mm_sub_ps(iy0,jy2);
858             dz02             = _mm_sub_ps(iz0,jz2);
859             dx10             = _mm_sub_ps(ix1,jx0);
860             dy10             = _mm_sub_ps(iy1,jy0);
861             dz10             = _mm_sub_ps(iz1,jz0);
862             dx11             = _mm_sub_ps(ix1,jx1);
863             dy11             = _mm_sub_ps(iy1,jy1);
864             dz11             = _mm_sub_ps(iz1,jz1);
865             dx12             = _mm_sub_ps(ix1,jx2);
866             dy12             = _mm_sub_ps(iy1,jy2);
867             dz12             = _mm_sub_ps(iz1,jz2);
868             dx20             = _mm_sub_ps(ix2,jx0);
869             dy20             = _mm_sub_ps(iy2,jy0);
870             dz20             = _mm_sub_ps(iz2,jz0);
871             dx21             = _mm_sub_ps(ix2,jx1);
872             dy21             = _mm_sub_ps(iy2,jy1);
873             dz21             = _mm_sub_ps(iz2,jz1);
874             dx22             = _mm_sub_ps(ix2,jx2);
875             dy22             = _mm_sub_ps(iy2,jy2);
876             dz22             = _mm_sub_ps(iz2,jz2);
877
878             /* Calculate squared distance and things based on it */
879             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
880             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
881             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
882             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
883             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
884             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
885             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
886             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
887             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
888
889             rinv00           = avx128fma_invsqrt_f(rsq00);
890             rinv01           = avx128fma_invsqrt_f(rsq01);
891             rinv02           = avx128fma_invsqrt_f(rsq02);
892             rinv10           = avx128fma_invsqrt_f(rsq10);
893             rinv11           = avx128fma_invsqrt_f(rsq11);
894             rinv12           = avx128fma_invsqrt_f(rsq12);
895             rinv20           = avx128fma_invsqrt_f(rsq20);
896             rinv21           = avx128fma_invsqrt_f(rsq21);
897             rinv22           = avx128fma_invsqrt_f(rsq22);
898
899             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
900             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
901             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
902             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
903             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
904             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
905             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
906             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
907             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
908
909             fjx0             = _mm_setzero_ps();
910             fjy0             = _mm_setzero_ps();
911             fjz0             = _mm_setzero_ps();
912             fjx1             = _mm_setzero_ps();
913             fjy1             = _mm_setzero_ps();
914             fjz1             = _mm_setzero_ps();
915             fjx2             = _mm_setzero_ps();
916             fjy2             = _mm_setzero_ps();
917             fjz2             = _mm_setzero_ps();
918
919             /**************************
920              * CALCULATE INTERACTIONS *
921              **************************/
922
923             if (gmx_mm_any_lt(rsq00,rcutoff2))
924             {
925
926             r00              = _mm_mul_ps(rsq00,rinv00);
927             r00              = _mm_andnot_ps(dummy_mask,r00);
928
929             /* EWALD ELECTROSTATICS */
930
931             /* Analytical PME correction */
932             zeta2            = _mm_mul_ps(beta2,rsq00);
933             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
934             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
935             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
936             felec            = _mm_mul_ps(qq00,felec);
937             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
938             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
939             velec            = _mm_mul_ps(qq00,velec);
940
941             /* LENNARD-JONES DISPERSION/REPULSION */
942
943             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
944             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
945             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
946             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
947             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
948
949             d                = _mm_sub_ps(r00,rswitch);
950             d                = _mm_max_ps(d,_mm_setzero_ps());
951             d2               = _mm_mul_ps(d,d);
952             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
953
954             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
955
956             /* Evaluate switch function */
957             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
958             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
959             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
960             velec            = _mm_mul_ps(velec,sw);
961             vvdw             = _mm_mul_ps(vvdw,sw);
962             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
963
964             /* Update potential sum for this i atom from the interaction with this j atom. */
965             velec            = _mm_and_ps(velec,cutoff_mask);
966             velec            = _mm_andnot_ps(dummy_mask,velec);
967             velecsum         = _mm_add_ps(velecsum,velec);
968             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
969             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
970             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
971
972             fscal            = _mm_add_ps(felec,fvdw);
973
974             fscal            = _mm_and_ps(fscal,cutoff_mask);
975
976             fscal            = _mm_andnot_ps(dummy_mask,fscal);
977
978              /* Update vectorial force */
979             fix0             = _mm_macc_ps(dx00,fscal,fix0);
980             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
981             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
982
983             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
984             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
985             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
986
987             }
988
989             /**************************
990              * CALCULATE INTERACTIONS *
991              **************************/
992
993             if (gmx_mm_any_lt(rsq01,rcutoff2))
994             {
995
996             r01              = _mm_mul_ps(rsq01,rinv01);
997             r01              = _mm_andnot_ps(dummy_mask,r01);
998
999             /* EWALD ELECTROSTATICS */
1000
1001             /* Analytical PME correction */
1002             zeta2            = _mm_mul_ps(beta2,rsq01);
1003             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1004             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1005             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1006             felec            = _mm_mul_ps(qq01,felec);
1007             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1008             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1009             velec            = _mm_mul_ps(qq01,velec);
1010
1011             d                = _mm_sub_ps(r01,rswitch);
1012             d                = _mm_max_ps(d,_mm_setzero_ps());
1013             d2               = _mm_mul_ps(d,d);
1014             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1015
1016             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1017
1018             /* Evaluate switch function */
1019             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1020             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1021             velec            = _mm_mul_ps(velec,sw);
1022             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1023
1024             /* Update potential sum for this i atom from the interaction with this j atom. */
1025             velec            = _mm_and_ps(velec,cutoff_mask);
1026             velec            = _mm_andnot_ps(dummy_mask,velec);
1027             velecsum         = _mm_add_ps(velecsum,velec);
1028
1029             fscal            = felec;
1030
1031             fscal            = _mm_and_ps(fscal,cutoff_mask);
1032
1033             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1034
1035              /* Update vectorial force */
1036             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1037             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1038             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1039
1040             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1041             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1042             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1043
1044             }
1045
1046             /**************************
1047              * CALCULATE INTERACTIONS *
1048              **************************/
1049
1050             if (gmx_mm_any_lt(rsq02,rcutoff2))
1051             {
1052
1053             r02              = _mm_mul_ps(rsq02,rinv02);
1054             r02              = _mm_andnot_ps(dummy_mask,r02);
1055
1056             /* EWALD ELECTROSTATICS */
1057
1058             /* Analytical PME correction */
1059             zeta2            = _mm_mul_ps(beta2,rsq02);
1060             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1061             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1062             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1063             felec            = _mm_mul_ps(qq02,felec);
1064             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1065             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1066             velec            = _mm_mul_ps(qq02,velec);
1067
1068             d                = _mm_sub_ps(r02,rswitch);
1069             d                = _mm_max_ps(d,_mm_setzero_ps());
1070             d2               = _mm_mul_ps(d,d);
1071             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1072
1073             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1074
1075             /* Evaluate switch function */
1076             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1077             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1078             velec            = _mm_mul_ps(velec,sw);
1079             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1080
1081             /* Update potential sum for this i atom from the interaction with this j atom. */
1082             velec            = _mm_and_ps(velec,cutoff_mask);
1083             velec            = _mm_andnot_ps(dummy_mask,velec);
1084             velecsum         = _mm_add_ps(velecsum,velec);
1085
1086             fscal            = felec;
1087
1088             fscal            = _mm_and_ps(fscal,cutoff_mask);
1089
1090             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1091
1092              /* Update vectorial force */
1093             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1094             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1095             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1096
1097             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1098             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1099             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1100
1101             }
1102
1103             /**************************
1104              * CALCULATE INTERACTIONS *
1105              **************************/
1106
1107             if (gmx_mm_any_lt(rsq10,rcutoff2))
1108             {
1109
1110             r10              = _mm_mul_ps(rsq10,rinv10);
1111             r10              = _mm_andnot_ps(dummy_mask,r10);
1112
1113             /* EWALD ELECTROSTATICS */
1114
1115             /* Analytical PME correction */
1116             zeta2            = _mm_mul_ps(beta2,rsq10);
1117             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1118             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1119             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1120             felec            = _mm_mul_ps(qq10,felec);
1121             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1122             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1123             velec            = _mm_mul_ps(qq10,velec);
1124
1125             d                = _mm_sub_ps(r10,rswitch);
1126             d                = _mm_max_ps(d,_mm_setzero_ps());
1127             d2               = _mm_mul_ps(d,d);
1128             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1129
1130             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1131
1132             /* Evaluate switch function */
1133             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1134             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1135             velec            = _mm_mul_ps(velec,sw);
1136             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1137
1138             /* Update potential sum for this i atom from the interaction with this j atom. */
1139             velec            = _mm_and_ps(velec,cutoff_mask);
1140             velec            = _mm_andnot_ps(dummy_mask,velec);
1141             velecsum         = _mm_add_ps(velecsum,velec);
1142
1143             fscal            = felec;
1144
1145             fscal            = _mm_and_ps(fscal,cutoff_mask);
1146
1147             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1148
1149              /* Update vectorial force */
1150             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1151             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1152             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1153
1154             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1155             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1156             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1157
1158             }
1159
1160             /**************************
1161              * CALCULATE INTERACTIONS *
1162              **************************/
1163
1164             if (gmx_mm_any_lt(rsq11,rcutoff2))
1165             {
1166
1167             r11              = _mm_mul_ps(rsq11,rinv11);
1168             r11              = _mm_andnot_ps(dummy_mask,r11);
1169
1170             /* EWALD ELECTROSTATICS */
1171
1172             /* Analytical PME correction */
1173             zeta2            = _mm_mul_ps(beta2,rsq11);
1174             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1175             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1176             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1177             felec            = _mm_mul_ps(qq11,felec);
1178             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1179             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1180             velec            = _mm_mul_ps(qq11,velec);
1181
1182             d                = _mm_sub_ps(r11,rswitch);
1183             d                = _mm_max_ps(d,_mm_setzero_ps());
1184             d2               = _mm_mul_ps(d,d);
1185             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1186
1187             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1188
1189             /* Evaluate switch function */
1190             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1191             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1192             velec            = _mm_mul_ps(velec,sw);
1193             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1194
1195             /* Update potential sum for this i atom from the interaction with this j atom. */
1196             velec            = _mm_and_ps(velec,cutoff_mask);
1197             velec            = _mm_andnot_ps(dummy_mask,velec);
1198             velecsum         = _mm_add_ps(velecsum,velec);
1199
1200             fscal            = felec;
1201
1202             fscal            = _mm_and_ps(fscal,cutoff_mask);
1203
1204             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1205
1206              /* Update vectorial force */
1207             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1208             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1209             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1210
1211             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1212             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1213             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1214
1215             }
1216
1217             /**************************
1218              * CALCULATE INTERACTIONS *
1219              **************************/
1220
1221             if (gmx_mm_any_lt(rsq12,rcutoff2))
1222             {
1223
1224             r12              = _mm_mul_ps(rsq12,rinv12);
1225             r12              = _mm_andnot_ps(dummy_mask,r12);
1226
1227             /* EWALD ELECTROSTATICS */
1228
1229             /* Analytical PME correction */
1230             zeta2            = _mm_mul_ps(beta2,rsq12);
1231             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1232             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1233             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1234             felec            = _mm_mul_ps(qq12,felec);
1235             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1236             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1237             velec            = _mm_mul_ps(qq12,velec);
1238
1239             d                = _mm_sub_ps(r12,rswitch);
1240             d                = _mm_max_ps(d,_mm_setzero_ps());
1241             d2               = _mm_mul_ps(d,d);
1242             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1243
1244             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1245
1246             /* Evaluate switch function */
1247             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1248             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1249             velec            = _mm_mul_ps(velec,sw);
1250             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1251
1252             /* Update potential sum for this i atom from the interaction with this j atom. */
1253             velec            = _mm_and_ps(velec,cutoff_mask);
1254             velec            = _mm_andnot_ps(dummy_mask,velec);
1255             velecsum         = _mm_add_ps(velecsum,velec);
1256
1257             fscal            = felec;
1258
1259             fscal            = _mm_and_ps(fscal,cutoff_mask);
1260
1261             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1262
1263              /* Update vectorial force */
1264             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1265             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1266             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1267
1268             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1269             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1270             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1271
1272             }
1273
1274             /**************************
1275              * CALCULATE INTERACTIONS *
1276              **************************/
1277
1278             if (gmx_mm_any_lt(rsq20,rcutoff2))
1279             {
1280
1281             r20              = _mm_mul_ps(rsq20,rinv20);
1282             r20              = _mm_andnot_ps(dummy_mask,r20);
1283
1284             /* EWALD ELECTROSTATICS */
1285
1286             /* Analytical PME correction */
1287             zeta2            = _mm_mul_ps(beta2,rsq20);
1288             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1289             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1290             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1291             felec            = _mm_mul_ps(qq20,felec);
1292             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1293             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1294             velec            = _mm_mul_ps(qq20,velec);
1295
1296             d                = _mm_sub_ps(r20,rswitch);
1297             d                = _mm_max_ps(d,_mm_setzero_ps());
1298             d2               = _mm_mul_ps(d,d);
1299             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1300
1301             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1302
1303             /* Evaluate switch function */
1304             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1305             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1306             velec            = _mm_mul_ps(velec,sw);
1307             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1308
1309             /* Update potential sum for this i atom from the interaction with this j atom. */
1310             velec            = _mm_and_ps(velec,cutoff_mask);
1311             velec            = _mm_andnot_ps(dummy_mask,velec);
1312             velecsum         = _mm_add_ps(velecsum,velec);
1313
1314             fscal            = felec;
1315
1316             fscal            = _mm_and_ps(fscal,cutoff_mask);
1317
1318             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1319
1320              /* Update vectorial force */
1321             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1322             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1323             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1324
1325             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1326             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1327             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1328
1329             }
1330
1331             /**************************
1332              * CALCULATE INTERACTIONS *
1333              **************************/
1334
1335             if (gmx_mm_any_lt(rsq21,rcutoff2))
1336             {
1337
1338             r21              = _mm_mul_ps(rsq21,rinv21);
1339             r21              = _mm_andnot_ps(dummy_mask,r21);
1340
1341             /* EWALD ELECTROSTATICS */
1342
1343             /* Analytical PME correction */
1344             zeta2            = _mm_mul_ps(beta2,rsq21);
1345             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1346             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1347             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1348             felec            = _mm_mul_ps(qq21,felec);
1349             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1350             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1351             velec            = _mm_mul_ps(qq21,velec);
1352
1353             d                = _mm_sub_ps(r21,rswitch);
1354             d                = _mm_max_ps(d,_mm_setzero_ps());
1355             d2               = _mm_mul_ps(d,d);
1356             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1357
1358             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1359
1360             /* Evaluate switch function */
1361             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1362             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1363             velec            = _mm_mul_ps(velec,sw);
1364             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1365
1366             /* Update potential sum for this i atom from the interaction with this j atom. */
1367             velec            = _mm_and_ps(velec,cutoff_mask);
1368             velec            = _mm_andnot_ps(dummy_mask,velec);
1369             velecsum         = _mm_add_ps(velecsum,velec);
1370
1371             fscal            = felec;
1372
1373             fscal            = _mm_and_ps(fscal,cutoff_mask);
1374
1375             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1376
1377              /* Update vectorial force */
1378             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1379             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1380             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1381
1382             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1383             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1384             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1385
1386             }
1387
1388             /**************************
1389              * CALCULATE INTERACTIONS *
1390              **************************/
1391
1392             if (gmx_mm_any_lt(rsq22,rcutoff2))
1393             {
1394
1395             r22              = _mm_mul_ps(rsq22,rinv22);
1396             r22              = _mm_andnot_ps(dummy_mask,r22);
1397
1398             /* EWALD ELECTROSTATICS */
1399
1400             /* Analytical PME correction */
1401             zeta2            = _mm_mul_ps(beta2,rsq22);
1402             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1403             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1404             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1405             felec            = _mm_mul_ps(qq22,felec);
1406             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1407             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1408             velec            = _mm_mul_ps(qq22,velec);
1409
1410             d                = _mm_sub_ps(r22,rswitch);
1411             d                = _mm_max_ps(d,_mm_setzero_ps());
1412             d2               = _mm_mul_ps(d,d);
1413             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1414
1415             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1416
1417             /* Evaluate switch function */
1418             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1419             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1420             velec            = _mm_mul_ps(velec,sw);
1421             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1422
1423             /* Update potential sum for this i atom from the interaction with this j atom. */
1424             velec            = _mm_and_ps(velec,cutoff_mask);
1425             velec            = _mm_andnot_ps(dummy_mask,velec);
1426             velecsum         = _mm_add_ps(velecsum,velec);
1427
1428             fscal            = felec;
1429
1430             fscal            = _mm_and_ps(fscal,cutoff_mask);
1431
1432             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1433
1434              /* Update vectorial force */
1435             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1436             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1437             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1438
1439             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1440             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1441             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1442
1443             }
1444
1445             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1446             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1447             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1448             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1449
1450             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1451                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1452
1453             /* Inner loop uses 504 flops */
1454         }
1455
1456         /* End of innermost loop */
1457
1458         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1459                                               f+i_coord_offset,fshift+i_shift_offset);
1460
1461         ggid                        = gid[iidx];
1462         /* Update potential energies */
1463         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1464         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1465
1466         /* Increment number of inner iterations */
1467         inneriter                  += j_index_end - j_index_start;
1468
1469         /* Outer loop uses 20 flops */
1470     }
1471
1472     /* Increment number of outer iterations */
1473     outeriter        += nri;
1474
1475     /* Update outer/inner flops */
1476
1477     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*504);
1478 }
1479 /*
1480  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1481  * Electrostatics interaction: Ewald
1482  * VdW interaction:            LennardJones
1483  * Geometry:                   Water3-Water3
1484  * Calculate force/pot:        Force
1485  */
1486 void
1487 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1488                     (t_nblist                    * gmx_restrict       nlist,
1489                      rvec                        * gmx_restrict          xx,
1490                      rvec                        * gmx_restrict          ff,
1491                      struct t_forcerec           * gmx_restrict          fr,
1492                      t_mdatoms                   * gmx_restrict     mdatoms,
1493                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1494                      t_nrnb                      * gmx_restrict        nrnb)
1495 {
1496     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1497      * just 0 for non-waters.
1498      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1499      * jnr indices corresponding to data put in the four positions in the SIMD register.
1500      */
1501     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1502     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1503     int              jnrA,jnrB,jnrC,jnrD;
1504     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1505     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1506     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1507     real             rcutoff_scalar;
1508     real             *shiftvec,*fshift,*x,*f;
1509     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1510     real             scratch[4*DIM];
1511     __m128           fscal,rcutoff,rcutoff2,jidxall;
1512     int              vdwioffset0;
1513     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1514     int              vdwioffset1;
1515     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1516     int              vdwioffset2;
1517     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1518     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1519     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1520     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1521     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1522     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1523     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1524     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1525     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1526     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1527     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1528     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1529     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1530     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1531     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1532     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1533     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1534     real             *charge;
1535     int              nvdwtype;
1536     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1537     int              *vdwtype;
1538     real             *vdwparam;
1539     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1540     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1541     __m128i          ewitab;
1542     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1543     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1544     real             *ewtab;
1545     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1546     real             rswitch_scalar,d_scalar;
1547     __m128           dummy_mask,cutoff_mask;
1548     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1549     __m128           one     = _mm_set1_ps(1.0);
1550     __m128           two     = _mm_set1_ps(2.0);
1551     x                = xx[0];
1552     f                = ff[0];
1553
1554     nri              = nlist->nri;
1555     iinr             = nlist->iinr;
1556     jindex           = nlist->jindex;
1557     jjnr             = nlist->jjnr;
1558     shiftidx         = nlist->shift;
1559     gid              = nlist->gid;
1560     shiftvec         = fr->shift_vec[0];
1561     fshift           = fr->fshift[0];
1562     facel            = _mm_set1_ps(fr->ic->epsfac);
1563     charge           = mdatoms->chargeA;
1564     nvdwtype         = fr->ntype;
1565     vdwparam         = fr->nbfp;
1566     vdwtype          = mdatoms->typeA;
1567
1568     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1569     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1570     beta2            = _mm_mul_ps(beta,beta);
1571     beta3            = _mm_mul_ps(beta,beta2);
1572     ewtab            = fr->ic->tabq_coul_FDV0;
1573     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1574     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1575
1576     /* Setup water-specific parameters */
1577     inr              = nlist->iinr[0];
1578     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1579     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1580     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1581     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1582
1583     jq0              = _mm_set1_ps(charge[inr+0]);
1584     jq1              = _mm_set1_ps(charge[inr+1]);
1585     jq2              = _mm_set1_ps(charge[inr+2]);
1586     vdwjidx0A        = 2*vdwtype[inr+0];
1587     qq00             = _mm_mul_ps(iq0,jq0);
1588     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1589     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1590     qq01             = _mm_mul_ps(iq0,jq1);
1591     qq02             = _mm_mul_ps(iq0,jq2);
1592     qq10             = _mm_mul_ps(iq1,jq0);
1593     qq11             = _mm_mul_ps(iq1,jq1);
1594     qq12             = _mm_mul_ps(iq1,jq2);
1595     qq20             = _mm_mul_ps(iq2,jq0);
1596     qq21             = _mm_mul_ps(iq2,jq1);
1597     qq22             = _mm_mul_ps(iq2,jq2);
1598
1599     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1600     rcutoff_scalar   = fr->ic->rcoulomb;
1601     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1602     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1603
1604     rswitch_scalar   = fr->ic->rcoulomb_switch;
1605     rswitch          = _mm_set1_ps(rswitch_scalar);
1606     /* Setup switch parameters */
1607     d_scalar         = rcutoff_scalar-rswitch_scalar;
1608     d                = _mm_set1_ps(d_scalar);
1609     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1610     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1611     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1612     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1613     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1614     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1615
1616     /* Avoid stupid compiler warnings */
1617     jnrA = jnrB = jnrC = jnrD = 0;
1618     j_coord_offsetA = 0;
1619     j_coord_offsetB = 0;
1620     j_coord_offsetC = 0;
1621     j_coord_offsetD = 0;
1622
1623     outeriter        = 0;
1624     inneriter        = 0;
1625
1626     for(iidx=0;iidx<4*DIM;iidx++)
1627     {
1628         scratch[iidx] = 0.0;
1629     }
1630
1631     /* Start outer loop over neighborlists */
1632     for(iidx=0; iidx<nri; iidx++)
1633     {
1634         /* Load shift vector for this list */
1635         i_shift_offset   = DIM*shiftidx[iidx];
1636
1637         /* Load limits for loop over neighbors */
1638         j_index_start    = jindex[iidx];
1639         j_index_end      = jindex[iidx+1];
1640
1641         /* Get outer coordinate index */
1642         inr              = iinr[iidx];
1643         i_coord_offset   = DIM*inr;
1644
1645         /* Load i particle coords and add shift vector */
1646         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1647                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1648
1649         fix0             = _mm_setzero_ps();
1650         fiy0             = _mm_setzero_ps();
1651         fiz0             = _mm_setzero_ps();
1652         fix1             = _mm_setzero_ps();
1653         fiy1             = _mm_setzero_ps();
1654         fiz1             = _mm_setzero_ps();
1655         fix2             = _mm_setzero_ps();
1656         fiy2             = _mm_setzero_ps();
1657         fiz2             = _mm_setzero_ps();
1658
1659         /* Start inner kernel loop */
1660         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1661         {
1662
1663             /* Get j neighbor index, and coordinate index */
1664             jnrA             = jjnr[jidx];
1665             jnrB             = jjnr[jidx+1];
1666             jnrC             = jjnr[jidx+2];
1667             jnrD             = jjnr[jidx+3];
1668             j_coord_offsetA  = DIM*jnrA;
1669             j_coord_offsetB  = DIM*jnrB;
1670             j_coord_offsetC  = DIM*jnrC;
1671             j_coord_offsetD  = DIM*jnrD;
1672
1673             /* load j atom coordinates */
1674             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1675                                               x+j_coord_offsetC,x+j_coord_offsetD,
1676                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1677
1678             /* Calculate displacement vector */
1679             dx00             = _mm_sub_ps(ix0,jx0);
1680             dy00             = _mm_sub_ps(iy0,jy0);
1681             dz00             = _mm_sub_ps(iz0,jz0);
1682             dx01             = _mm_sub_ps(ix0,jx1);
1683             dy01             = _mm_sub_ps(iy0,jy1);
1684             dz01             = _mm_sub_ps(iz0,jz1);
1685             dx02             = _mm_sub_ps(ix0,jx2);
1686             dy02             = _mm_sub_ps(iy0,jy2);
1687             dz02             = _mm_sub_ps(iz0,jz2);
1688             dx10             = _mm_sub_ps(ix1,jx0);
1689             dy10             = _mm_sub_ps(iy1,jy0);
1690             dz10             = _mm_sub_ps(iz1,jz0);
1691             dx11             = _mm_sub_ps(ix1,jx1);
1692             dy11             = _mm_sub_ps(iy1,jy1);
1693             dz11             = _mm_sub_ps(iz1,jz1);
1694             dx12             = _mm_sub_ps(ix1,jx2);
1695             dy12             = _mm_sub_ps(iy1,jy2);
1696             dz12             = _mm_sub_ps(iz1,jz2);
1697             dx20             = _mm_sub_ps(ix2,jx0);
1698             dy20             = _mm_sub_ps(iy2,jy0);
1699             dz20             = _mm_sub_ps(iz2,jz0);
1700             dx21             = _mm_sub_ps(ix2,jx1);
1701             dy21             = _mm_sub_ps(iy2,jy1);
1702             dz21             = _mm_sub_ps(iz2,jz1);
1703             dx22             = _mm_sub_ps(ix2,jx2);
1704             dy22             = _mm_sub_ps(iy2,jy2);
1705             dz22             = _mm_sub_ps(iz2,jz2);
1706
1707             /* Calculate squared distance and things based on it */
1708             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1709             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1710             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1711             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1712             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1713             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1714             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1715             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1716             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1717
1718             rinv00           = avx128fma_invsqrt_f(rsq00);
1719             rinv01           = avx128fma_invsqrt_f(rsq01);
1720             rinv02           = avx128fma_invsqrt_f(rsq02);
1721             rinv10           = avx128fma_invsqrt_f(rsq10);
1722             rinv11           = avx128fma_invsqrt_f(rsq11);
1723             rinv12           = avx128fma_invsqrt_f(rsq12);
1724             rinv20           = avx128fma_invsqrt_f(rsq20);
1725             rinv21           = avx128fma_invsqrt_f(rsq21);
1726             rinv22           = avx128fma_invsqrt_f(rsq22);
1727
1728             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1729             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1730             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1731             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1732             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1733             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1734             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1735             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1736             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1737
1738             fjx0             = _mm_setzero_ps();
1739             fjy0             = _mm_setzero_ps();
1740             fjz0             = _mm_setzero_ps();
1741             fjx1             = _mm_setzero_ps();
1742             fjy1             = _mm_setzero_ps();
1743             fjz1             = _mm_setzero_ps();
1744             fjx2             = _mm_setzero_ps();
1745             fjy2             = _mm_setzero_ps();
1746             fjz2             = _mm_setzero_ps();
1747
1748             /**************************
1749              * CALCULATE INTERACTIONS *
1750              **************************/
1751
1752             if (gmx_mm_any_lt(rsq00,rcutoff2))
1753             {
1754
1755             r00              = _mm_mul_ps(rsq00,rinv00);
1756
1757             /* EWALD ELECTROSTATICS */
1758
1759             /* Analytical PME correction */
1760             zeta2            = _mm_mul_ps(beta2,rsq00);
1761             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1762             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1763             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1764             felec            = _mm_mul_ps(qq00,felec);
1765             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1766             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
1767             velec            = _mm_mul_ps(qq00,velec);
1768
1769             /* LENNARD-JONES DISPERSION/REPULSION */
1770
1771             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1772             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1773             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1774             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1775             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1776
1777             d                = _mm_sub_ps(r00,rswitch);
1778             d                = _mm_max_ps(d,_mm_setzero_ps());
1779             d2               = _mm_mul_ps(d,d);
1780             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1781
1782             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1783
1784             /* Evaluate switch function */
1785             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1786             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1787             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1788             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1789
1790             fscal            = _mm_add_ps(felec,fvdw);
1791
1792             fscal            = _mm_and_ps(fscal,cutoff_mask);
1793
1794              /* Update vectorial force */
1795             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1796             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1797             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1798
1799             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1800             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1801             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1802
1803             }
1804
1805             /**************************
1806              * CALCULATE INTERACTIONS *
1807              **************************/
1808
1809             if (gmx_mm_any_lt(rsq01,rcutoff2))
1810             {
1811
1812             r01              = _mm_mul_ps(rsq01,rinv01);
1813
1814             /* EWALD ELECTROSTATICS */
1815
1816             /* Analytical PME correction */
1817             zeta2            = _mm_mul_ps(beta2,rsq01);
1818             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1819             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1820             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1821             felec            = _mm_mul_ps(qq01,felec);
1822             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1823             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1824             velec            = _mm_mul_ps(qq01,velec);
1825
1826             d                = _mm_sub_ps(r01,rswitch);
1827             d                = _mm_max_ps(d,_mm_setzero_ps());
1828             d2               = _mm_mul_ps(d,d);
1829             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1830
1831             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1832
1833             /* Evaluate switch function */
1834             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1835             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1836             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1837
1838             fscal            = felec;
1839
1840             fscal            = _mm_and_ps(fscal,cutoff_mask);
1841
1842              /* Update vectorial force */
1843             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1844             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1845             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1846
1847             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1848             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1849             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1850
1851             }
1852
1853             /**************************
1854              * CALCULATE INTERACTIONS *
1855              **************************/
1856
1857             if (gmx_mm_any_lt(rsq02,rcutoff2))
1858             {
1859
1860             r02              = _mm_mul_ps(rsq02,rinv02);
1861
1862             /* EWALD ELECTROSTATICS */
1863
1864             /* Analytical PME correction */
1865             zeta2            = _mm_mul_ps(beta2,rsq02);
1866             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1867             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1868             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1869             felec            = _mm_mul_ps(qq02,felec);
1870             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1871             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1872             velec            = _mm_mul_ps(qq02,velec);
1873
1874             d                = _mm_sub_ps(r02,rswitch);
1875             d                = _mm_max_ps(d,_mm_setzero_ps());
1876             d2               = _mm_mul_ps(d,d);
1877             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1878
1879             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1880
1881             /* Evaluate switch function */
1882             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1883             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1884             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1885
1886             fscal            = felec;
1887
1888             fscal            = _mm_and_ps(fscal,cutoff_mask);
1889
1890              /* Update vectorial force */
1891             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1892             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1893             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1894
1895             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1896             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1897             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1898
1899             }
1900
1901             /**************************
1902              * CALCULATE INTERACTIONS *
1903              **************************/
1904
1905             if (gmx_mm_any_lt(rsq10,rcutoff2))
1906             {
1907
1908             r10              = _mm_mul_ps(rsq10,rinv10);
1909
1910             /* EWALD ELECTROSTATICS */
1911
1912             /* Analytical PME correction */
1913             zeta2            = _mm_mul_ps(beta2,rsq10);
1914             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1915             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1916             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1917             felec            = _mm_mul_ps(qq10,felec);
1918             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1919             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1920             velec            = _mm_mul_ps(qq10,velec);
1921
1922             d                = _mm_sub_ps(r10,rswitch);
1923             d                = _mm_max_ps(d,_mm_setzero_ps());
1924             d2               = _mm_mul_ps(d,d);
1925             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1926
1927             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1928
1929             /* Evaluate switch function */
1930             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1931             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1932             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1933
1934             fscal            = felec;
1935
1936             fscal            = _mm_and_ps(fscal,cutoff_mask);
1937
1938              /* Update vectorial force */
1939             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1940             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1941             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1942
1943             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1944             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1945             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1946
1947             }
1948
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             if (gmx_mm_any_lt(rsq11,rcutoff2))
1954             {
1955
1956             r11              = _mm_mul_ps(rsq11,rinv11);
1957
1958             /* EWALD ELECTROSTATICS */
1959
1960             /* Analytical PME correction */
1961             zeta2            = _mm_mul_ps(beta2,rsq11);
1962             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1963             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
1964             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1965             felec            = _mm_mul_ps(qq11,felec);
1966             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
1967             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1968             velec            = _mm_mul_ps(qq11,velec);
1969
1970             d                = _mm_sub_ps(r11,rswitch);
1971             d                = _mm_max_ps(d,_mm_setzero_ps());
1972             d2               = _mm_mul_ps(d,d);
1973             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1974
1975             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1976
1977             /* Evaluate switch function */
1978             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1979             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1980             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1981
1982             fscal            = felec;
1983
1984             fscal            = _mm_and_ps(fscal,cutoff_mask);
1985
1986              /* Update vectorial force */
1987             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1988             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1989             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1990
1991             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1992             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1993             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1994
1995             }
1996
1997             /**************************
1998              * CALCULATE INTERACTIONS *
1999              **************************/
2000
2001             if (gmx_mm_any_lt(rsq12,rcutoff2))
2002             {
2003
2004             r12              = _mm_mul_ps(rsq12,rinv12);
2005
2006             /* EWALD ELECTROSTATICS */
2007
2008             /* Analytical PME correction */
2009             zeta2            = _mm_mul_ps(beta2,rsq12);
2010             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
2011             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2012             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2013             felec            = _mm_mul_ps(qq12,felec);
2014             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2015             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2016             velec            = _mm_mul_ps(qq12,velec);
2017
2018             d                = _mm_sub_ps(r12,rswitch);
2019             d                = _mm_max_ps(d,_mm_setzero_ps());
2020             d2               = _mm_mul_ps(d,d);
2021             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2022
2023             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2024
2025             /* Evaluate switch function */
2026             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2027             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2028             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2029
2030             fscal            = felec;
2031
2032             fscal            = _mm_and_ps(fscal,cutoff_mask);
2033
2034              /* Update vectorial force */
2035             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2036             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2037             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2038
2039             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2040             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2041             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2042
2043             }
2044
2045             /**************************
2046              * CALCULATE INTERACTIONS *
2047              **************************/
2048
2049             if (gmx_mm_any_lt(rsq20,rcutoff2))
2050             {
2051
2052             r20              = _mm_mul_ps(rsq20,rinv20);
2053
2054             /* EWALD ELECTROSTATICS */
2055
2056             /* Analytical PME correction */
2057             zeta2            = _mm_mul_ps(beta2,rsq20);
2058             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
2059             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2060             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2061             felec            = _mm_mul_ps(qq20,felec);
2062             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2063             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2064             velec            = _mm_mul_ps(qq20,velec);
2065
2066             d                = _mm_sub_ps(r20,rswitch);
2067             d                = _mm_max_ps(d,_mm_setzero_ps());
2068             d2               = _mm_mul_ps(d,d);
2069             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2070
2071             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2072
2073             /* Evaluate switch function */
2074             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2075             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2076             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2077
2078             fscal            = felec;
2079
2080             fscal            = _mm_and_ps(fscal,cutoff_mask);
2081
2082              /* Update vectorial force */
2083             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2084             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2085             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2086
2087             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2088             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2089             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2090
2091             }
2092
2093             /**************************
2094              * CALCULATE INTERACTIONS *
2095              **************************/
2096
2097             if (gmx_mm_any_lt(rsq21,rcutoff2))
2098             {
2099
2100             r21              = _mm_mul_ps(rsq21,rinv21);
2101
2102             /* EWALD ELECTROSTATICS */
2103
2104             /* Analytical PME correction */
2105             zeta2            = _mm_mul_ps(beta2,rsq21);
2106             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2107             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2108             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2109             felec            = _mm_mul_ps(qq21,felec);
2110             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2111             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2112             velec            = _mm_mul_ps(qq21,velec);
2113
2114             d                = _mm_sub_ps(r21,rswitch);
2115             d                = _mm_max_ps(d,_mm_setzero_ps());
2116             d2               = _mm_mul_ps(d,d);
2117             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2118
2119             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2120
2121             /* Evaluate switch function */
2122             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2123             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2124             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2125
2126             fscal            = felec;
2127
2128             fscal            = _mm_and_ps(fscal,cutoff_mask);
2129
2130              /* Update vectorial force */
2131             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2132             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2133             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2134
2135             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2136             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2137             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2138
2139             }
2140
2141             /**************************
2142              * CALCULATE INTERACTIONS *
2143              **************************/
2144
2145             if (gmx_mm_any_lt(rsq22,rcutoff2))
2146             {
2147
2148             r22              = _mm_mul_ps(rsq22,rinv22);
2149
2150             /* EWALD ELECTROSTATICS */
2151
2152             /* Analytical PME correction */
2153             zeta2            = _mm_mul_ps(beta2,rsq22);
2154             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2155             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2156             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2157             felec            = _mm_mul_ps(qq22,felec);
2158             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2159             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2160             velec            = _mm_mul_ps(qq22,velec);
2161
2162             d                = _mm_sub_ps(r22,rswitch);
2163             d                = _mm_max_ps(d,_mm_setzero_ps());
2164             d2               = _mm_mul_ps(d,d);
2165             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2166
2167             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2168
2169             /* Evaluate switch function */
2170             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2171             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2172             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2173
2174             fscal            = felec;
2175
2176             fscal            = _mm_and_ps(fscal,cutoff_mask);
2177
2178              /* Update vectorial force */
2179             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2180             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2181             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2182
2183             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2184             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2185             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2186
2187             }
2188
2189             fjptrA             = f+j_coord_offsetA;
2190             fjptrB             = f+j_coord_offsetB;
2191             fjptrC             = f+j_coord_offsetC;
2192             fjptrD             = f+j_coord_offsetD;
2193
2194             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2195                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2196
2197             /* Inner loop uses 465 flops */
2198         }
2199
2200         if(jidx<j_index_end)
2201         {
2202
2203             /* Get j neighbor index, and coordinate index */
2204             jnrlistA         = jjnr[jidx];
2205             jnrlistB         = jjnr[jidx+1];
2206             jnrlistC         = jjnr[jidx+2];
2207             jnrlistD         = jjnr[jidx+3];
2208             /* Sign of each element will be negative for non-real atoms.
2209              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2210              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2211              */
2212             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2213             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2214             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2215             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2216             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2217             j_coord_offsetA  = DIM*jnrA;
2218             j_coord_offsetB  = DIM*jnrB;
2219             j_coord_offsetC  = DIM*jnrC;
2220             j_coord_offsetD  = DIM*jnrD;
2221
2222             /* load j atom coordinates */
2223             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2224                                               x+j_coord_offsetC,x+j_coord_offsetD,
2225                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2226
2227             /* Calculate displacement vector */
2228             dx00             = _mm_sub_ps(ix0,jx0);
2229             dy00             = _mm_sub_ps(iy0,jy0);
2230             dz00             = _mm_sub_ps(iz0,jz0);
2231             dx01             = _mm_sub_ps(ix0,jx1);
2232             dy01             = _mm_sub_ps(iy0,jy1);
2233             dz01             = _mm_sub_ps(iz0,jz1);
2234             dx02             = _mm_sub_ps(ix0,jx2);
2235             dy02             = _mm_sub_ps(iy0,jy2);
2236             dz02             = _mm_sub_ps(iz0,jz2);
2237             dx10             = _mm_sub_ps(ix1,jx0);
2238             dy10             = _mm_sub_ps(iy1,jy0);
2239             dz10             = _mm_sub_ps(iz1,jz0);
2240             dx11             = _mm_sub_ps(ix1,jx1);
2241             dy11             = _mm_sub_ps(iy1,jy1);
2242             dz11             = _mm_sub_ps(iz1,jz1);
2243             dx12             = _mm_sub_ps(ix1,jx2);
2244             dy12             = _mm_sub_ps(iy1,jy2);
2245             dz12             = _mm_sub_ps(iz1,jz2);
2246             dx20             = _mm_sub_ps(ix2,jx0);
2247             dy20             = _mm_sub_ps(iy2,jy0);
2248             dz20             = _mm_sub_ps(iz2,jz0);
2249             dx21             = _mm_sub_ps(ix2,jx1);
2250             dy21             = _mm_sub_ps(iy2,jy1);
2251             dz21             = _mm_sub_ps(iz2,jz1);
2252             dx22             = _mm_sub_ps(ix2,jx2);
2253             dy22             = _mm_sub_ps(iy2,jy2);
2254             dz22             = _mm_sub_ps(iz2,jz2);
2255
2256             /* Calculate squared distance and things based on it */
2257             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2258             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2259             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2260             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2261             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2262             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2263             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2264             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2265             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2266
2267             rinv00           = avx128fma_invsqrt_f(rsq00);
2268             rinv01           = avx128fma_invsqrt_f(rsq01);
2269             rinv02           = avx128fma_invsqrt_f(rsq02);
2270             rinv10           = avx128fma_invsqrt_f(rsq10);
2271             rinv11           = avx128fma_invsqrt_f(rsq11);
2272             rinv12           = avx128fma_invsqrt_f(rsq12);
2273             rinv20           = avx128fma_invsqrt_f(rsq20);
2274             rinv21           = avx128fma_invsqrt_f(rsq21);
2275             rinv22           = avx128fma_invsqrt_f(rsq22);
2276
2277             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2278             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2279             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2280             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2281             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2282             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2283             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2284             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2285             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2286
2287             fjx0             = _mm_setzero_ps();
2288             fjy0             = _mm_setzero_ps();
2289             fjz0             = _mm_setzero_ps();
2290             fjx1             = _mm_setzero_ps();
2291             fjy1             = _mm_setzero_ps();
2292             fjz1             = _mm_setzero_ps();
2293             fjx2             = _mm_setzero_ps();
2294             fjy2             = _mm_setzero_ps();
2295             fjz2             = _mm_setzero_ps();
2296
2297             /**************************
2298              * CALCULATE INTERACTIONS *
2299              **************************/
2300
2301             if (gmx_mm_any_lt(rsq00,rcutoff2))
2302             {
2303
2304             r00              = _mm_mul_ps(rsq00,rinv00);
2305             r00              = _mm_andnot_ps(dummy_mask,r00);
2306
2307             /* EWALD ELECTROSTATICS */
2308
2309             /* Analytical PME correction */
2310             zeta2            = _mm_mul_ps(beta2,rsq00);
2311             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
2312             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2313             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2314             felec            = _mm_mul_ps(qq00,felec);
2315             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2316             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
2317             velec            = _mm_mul_ps(qq00,velec);
2318
2319             /* LENNARD-JONES DISPERSION/REPULSION */
2320
2321             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2322             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2323             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2324             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
2325             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2326
2327             d                = _mm_sub_ps(r00,rswitch);
2328             d                = _mm_max_ps(d,_mm_setzero_ps());
2329             d2               = _mm_mul_ps(d,d);
2330             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2331
2332             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2333
2334             /* Evaluate switch function */
2335             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2336             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2337             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2338             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2339
2340             fscal            = _mm_add_ps(felec,fvdw);
2341
2342             fscal            = _mm_and_ps(fscal,cutoff_mask);
2343
2344             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2345
2346              /* Update vectorial force */
2347             fix0             = _mm_macc_ps(dx00,fscal,fix0);
2348             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
2349             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
2350
2351             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
2352             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
2353             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
2354
2355             }
2356
2357             /**************************
2358              * CALCULATE INTERACTIONS *
2359              **************************/
2360
2361             if (gmx_mm_any_lt(rsq01,rcutoff2))
2362             {
2363
2364             r01              = _mm_mul_ps(rsq01,rinv01);
2365             r01              = _mm_andnot_ps(dummy_mask,r01);
2366
2367             /* EWALD ELECTROSTATICS */
2368
2369             /* Analytical PME correction */
2370             zeta2            = _mm_mul_ps(beta2,rsq01);
2371             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
2372             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2373             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2374             felec            = _mm_mul_ps(qq01,felec);
2375             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2376             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
2377             velec            = _mm_mul_ps(qq01,velec);
2378
2379             d                = _mm_sub_ps(r01,rswitch);
2380             d                = _mm_max_ps(d,_mm_setzero_ps());
2381             d2               = _mm_mul_ps(d,d);
2382             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2383
2384             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2385
2386             /* Evaluate switch function */
2387             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2388             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2389             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2390
2391             fscal            = felec;
2392
2393             fscal            = _mm_and_ps(fscal,cutoff_mask);
2394
2395             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2396
2397              /* Update vectorial force */
2398             fix0             = _mm_macc_ps(dx01,fscal,fix0);
2399             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
2400             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
2401
2402             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
2403             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
2404             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
2405
2406             }
2407
2408             /**************************
2409              * CALCULATE INTERACTIONS *
2410              **************************/
2411
2412             if (gmx_mm_any_lt(rsq02,rcutoff2))
2413             {
2414
2415             r02              = _mm_mul_ps(rsq02,rinv02);
2416             r02              = _mm_andnot_ps(dummy_mask,r02);
2417
2418             /* EWALD ELECTROSTATICS */
2419
2420             /* Analytical PME correction */
2421             zeta2            = _mm_mul_ps(beta2,rsq02);
2422             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
2423             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2424             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2425             felec            = _mm_mul_ps(qq02,felec);
2426             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2427             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
2428             velec            = _mm_mul_ps(qq02,velec);
2429
2430             d                = _mm_sub_ps(r02,rswitch);
2431             d                = _mm_max_ps(d,_mm_setzero_ps());
2432             d2               = _mm_mul_ps(d,d);
2433             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2434
2435             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2436
2437             /* Evaluate switch function */
2438             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2439             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2440             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2441
2442             fscal            = felec;
2443
2444             fscal            = _mm_and_ps(fscal,cutoff_mask);
2445
2446             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2447
2448              /* Update vectorial force */
2449             fix0             = _mm_macc_ps(dx02,fscal,fix0);
2450             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
2451             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
2452
2453             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
2454             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
2455             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
2456
2457             }
2458
2459             /**************************
2460              * CALCULATE INTERACTIONS *
2461              **************************/
2462
2463             if (gmx_mm_any_lt(rsq10,rcutoff2))
2464             {
2465
2466             r10              = _mm_mul_ps(rsq10,rinv10);
2467             r10              = _mm_andnot_ps(dummy_mask,r10);
2468
2469             /* EWALD ELECTROSTATICS */
2470
2471             /* Analytical PME correction */
2472             zeta2            = _mm_mul_ps(beta2,rsq10);
2473             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
2474             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2475             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2476             felec            = _mm_mul_ps(qq10,felec);
2477             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2478             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
2479             velec            = _mm_mul_ps(qq10,velec);
2480
2481             d                = _mm_sub_ps(r10,rswitch);
2482             d                = _mm_max_ps(d,_mm_setzero_ps());
2483             d2               = _mm_mul_ps(d,d);
2484             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2485
2486             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2487
2488             /* Evaluate switch function */
2489             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2490             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2491             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2492
2493             fscal            = felec;
2494
2495             fscal            = _mm_and_ps(fscal,cutoff_mask);
2496
2497             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2498
2499              /* Update vectorial force */
2500             fix1             = _mm_macc_ps(dx10,fscal,fix1);
2501             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
2502             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
2503
2504             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
2505             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
2506             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
2507
2508             }
2509
2510             /**************************
2511              * CALCULATE INTERACTIONS *
2512              **************************/
2513
2514             if (gmx_mm_any_lt(rsq11,rcutoff2))
2515             {
2516
2517             r11              = _mm_mul_ps(rsq11,rinv11);
2518             r11              = _mm_andnot_ps(dummy_mask,r11);
2519
2520             /* EWALD ELECTROSTATICS */
2521
2522             /* Analytical PME correction */
2523             zeta2            = _mm_mul_ps(beta2,rsq11);
2524             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
2525             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2526             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2527             felec            = _mm_mul_ps(qq11,felec);
2528             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2529             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2530             velec            = _mm_mul_ps(qq11,velec);
2531
2532             d                = _mm_sub_ps(r11,rswitch);
2533             d                = _mm_max_ps(d,_mm_setzero_ps());
2534             d2               = _mm_mul_ps(d,d);
2535             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2536
2537             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2538
2539             /* Evaluate switch function */
2540             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2541             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2542             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2543
2544             fscal            = felec;
2545
2546             fscal            = _mm_and_ps(fscal,cutoff_mask);
2547
2548             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2549
2550              /* Update vectorial force */
2551             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2552             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2553             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2554
2555             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2556             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2557             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2558
2559             }
2560
2561             /**************************
2562              * CALCULATE INTERACTIONS *
2563              **************************/
2564
2565             if (gmx_mm_any_lt(rsq12,rcutoff2))
2566             {
2567
2568             r12              = _mm_mul_ps(rsq12,rinv12);
2569             r12              = _mm_andnot_ps(dummy_mask,r12);
2570
2571             /* EWALD ELECTROSTATICS */
2572
2573             /* Analytical PME correction */
2574             zeta2            = _mm_mul_ps(beta2,rsq12);
2575             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
2576             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2577             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2578             felec            = _mm_mul_ps(qq12,felec);
2579             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2580             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2581             velec            = _mm_mul_ps(qq12,velec);
2582
2583             d                = _mm_sub_ps(r12,rswitch);
2584             d                = _mm_max_ps(d,_mm_setzero_ps());
2585             d2               = _mm_mul_ps(d,d);
2586             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2587
2588             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2589
2590             /* Evaluate switch function */
2591             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2592             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2593             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2594
2595             fscal            = felec;
2596
2597             fscal            = _mm_and_ps(fscal,cutoff_mask);
2598
2599             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2600
2601              /* Update vectorial force */
2602             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2603             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2604             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2605
2606             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2607             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2608             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2609
2610             }
2611
2612             /**************************
2613              * CALCULATE INTERACTIONS *
2614              **************************/
2615
2616             if (gmx_mm_any_lt(rsq20,rcutoff2))
2617             {
2618
2619             r20              = _mm_mul_ps(rsq20,rinv20);
2620             r20              = _mm_andnot_ps(dummy_mask,r20);
2621
2622             /* EWALD ELECTROSTATICS */
2623
2624             /* Analytical PME correction */
2625             zeta2            = _mm_mul_ps(beta2,rsq20);
2626             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
2627             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2628             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2629             felec            = _mm_mul_ps(qq20,felec);
2630             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2631             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2632             velec            = _mm_mul_ps(qq20,velec);
2633
2634             d                = _mm_sub_ps(r20,rswitch);
2635             d                = _mm_max_ps(d,_mm_setzero_ps());
2636             d2               = _mm_mul_ps(d,d);
2637             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2638
2639             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2640
2641             /* Evaluate switch function */
2642             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2643             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2644             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2645
2646             fscal            = felec;
2647
2648             fscal            = _mm_and_ps(fscal,cutoff_mask);
2649
2650             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2651
2652              /* Update vectorial force */
2653             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2654             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2655             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2656
2657             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2658             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2659             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2660
2661             }
2662
2663             /**************************
2664              * CALCULATE INTERACTIONS *
2665              **************************/
2666
2667             if (gmx_mm_any_lt(rsq21,rcutoff2))
2668             {
2669
2670             r21              = _mm_mul_ps(rsq21,rinv21);
2671             r21              = _mm_andnot_ps(dummy_mask,r21);
2672
2673             /* EWALD ELECTROSTATICS */
2674
2675             /* Analytical PME correction */
2676             zeta2            = _mm_mul_ps(beta2,rsq21);
2677             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2678             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2679             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2680             felec            = _mm_mul_ps(qq21,felec);
2681             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2682             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2683             velec            = _mm_mul_ps(qq21,velec);
2684
2685             d                = _mm_sub_ps(r21,rswitch);
2686             d                = _mm_max_ps(d,_mm_setzero_ps());
2687             d2               = _mm_mul_ps(d,d);
2688             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2689
2690             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2691
2692             /* Evaluate switch function */
2693             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2694             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2695             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2696
2697             fscal            = felec;
2698
2699             fscal            = _mm_and_ps(fscal,cutoff_mask);
2700
2701             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2702
2703              /* Update vectorial force */
2704             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2705             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2706             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2707
2708             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2709             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2710             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2711
2712             }
2713
2714             /**************************
2715              * CALCULATE INTERACTIONS *
2716              **************************/
2717
2718             if (gmx_mm_any_lt(rsq22,rcutoff2))
2719             {
2720
2721             r22              = _mm_mul_ps(rsq22,rinv22);
2722             r22              = _mm_andnot_ps(dummy_mask,r22);
2723
2724             /* EWALD ELECTROSTATICS */
2725
2726             /* Analytical PME correction */
2727             zeta2            = _mm_mul_ps(beta2,rsq22);
2728             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2729             pmecorrF         = avx128fma_pmecorrF_f(zeta2);
2730             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2731             felec            = _mm_mul_ps(qq22,felec);
2732             pmecorrV         = avx128fma_pmecorrV_f(zeta2);
2733             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2734             velec            = _mm_mul_ps(qq22,velec);
2735
2736             d                = _mm_sub_ps(r22,rswitch);
2737             d                = _mm_max_ps(d,_mm_setzero_ps());
2738             d2               = _mm_mul_ps(d,d);
2739             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2740
2741             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2742
2743             /* Evaluate switch function */
2744             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2745             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2746             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2747
2748             fscal            = felec;
2749
2750             fscal            = _mm_and_ps(fscal,cutoff_mask);
2751
2752             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2753
2754              /* Update vectorial force */
2755             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2756             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2757             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2758
2759             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2760             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2761             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2762
2763             }
2764
2765             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2766             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2767             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2768             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2769
2770             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2771                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2772
2773             /* Inner loop uses 474 flops */
2774         }
2775
2776         /* End of innermost loop */
2777
2778         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2779                                               f+i_coord_offset,fshift+i_shift_offset);
2780
2781         /* Increment number of inner iterations */
2782         inneriter                  += j_index_end - j_index_start;
2783
2784         /* Outer loop uses 18 flops */
2785     }
2786
2787     /* Increment number of outer iterations */
2788     outeriter        += nri;
2789
2790     /* Update outer/inner flops */
2791
2792     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*474);
2793 }