96e31a347e2afd9712bfd9783e0006c1ec99abe5
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
48 #include "kernelutil_x86_avx_128_fma_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128i          ewitab;
113     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
114     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
115     real             *ewtab;
116     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
117     real             rswitch_scalar,d_scalar;
118     __m128           dummy_mask,cutoff_mask;
119     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
120     __m128           one     = _mm_set1_ps(1.0);
121     __m128           two     = _mm_set1_ps(2.0);
122     x                = xx[0];
123     f                = ff[0];
124
125     nri              = nlist->nri;
126     iinr             = nlist->iinr;
127     jindex           = nlist->jindex;
128     jjnr             = nlist->jjnr;
129     shiftidx         = nlist->shift;
130     gid              = nlist->gid;
131     shiftvec         = fr->shift_vec[0];
132     fshift           = fr->fshift[0];
133     facel            = _mm_set1_ps(fr->epsfac);
134     charge           = mdatoms->chargeA;
135     nvdwtype         = fr->ntype;
136     vdwparam         = fr->nbfp;
137     vdwtype          = mdatoms->typeA;
138
139     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
140     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
141     beta2            = _mm_mul_ps(beta,beta);
142     beta3            = _mm_mul_ps(beta,beta2);
143     ewtab            = fr->ic->tabq_coul_FDV0;
144     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
145     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
146
147     /* Setup water-specific parameters */
148     inr              = nlist->iinr[0];
149     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
150     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
151     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
152     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
153
154     jq0              = _mm_set1_ps(charge[inr+0]);
155     jq1              = _mm_set1_ps(charge[inr+1]);
156     jq2              = _mm_set1_ps(charge[inr+2]);
157     vdwjidx0A        = 2*vdwtype[inr+0];
158     qq00             = _mm_mul_ps(iq0,jq0);
159     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
160     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
161     qq01             = _mm_mul_ps(iq0,jq1);
162     qq02             = _mm_mul_ps(iq0,jq2);
163     qq10             = _mm_mul_ps(iq1,jq0);
164     qq11             = _mm_mul_ps(iq1,jq1);
165     qq12             = _mm_mul_ps(iq1,jq2);
166     qq20             = _mm_mul_ps(iq2,jq0);
167     qq21             = _mm_mul_ps(iq2,jq1);
168     qq22             = _mm_mul_ps(iq2,jq2);
169
170     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
171     rcutoff_scalar   = fr->rcoulomb;
172     rcutoff          = _mm_set1_ps(rcutoff_scalar);
173     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
174
175     rswitch_scalar   = fr->rcoulomb_switch;
176     rswitch          = _mm_set1_ps(rswitch_scalar);
177     /* Setup switch parameters */
178     d_scalar         = rcutoff_scalar-rswitch_scalar;
179     d                = _mm_set1_ps(d_scalar);
180     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
181     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
182     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
184     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
186
187     /* Avoid stupid compiler warnings */
188     jnrA = jnrB = jnrC = jnrD = 0;
189     j_coord_offsetA = 0;
190     j_coord_offsetB = 0;
191     j_coord_offsetC = 0;
192     j_coord_offsetD = 0;
193
194     outeriter        = 0;
195     inneriter        = 0;
196
197     for(iidx=0;iidx<4*DIM;iidx++)
198     {
199         scratch[iidx] = 0.0;
200     }
201
202     /* Start outer loop over neighborlists */
203     for(iidx=0; iidx<nri; iidx++)
204     {
205         /* Load shift vector for this list */
206         i_shift_offset   = DIM*shiftidx[iidx];
207
208         /* Load limits for loop over neighbors */
209         j_index_start    = jindex[iidx];
210         j_index_end      = jindex[iidx+1];
211
212         /* Get outer coordinate index */
213         inr              = iinr[iidx];
214         i_coord_offset   = DIM*inr;
215
216         /* Load i particle coords and add shift vector */
217         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
218                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
219
220         fix0             = _mm_setzero_ps();
221         fiy0             = _mm_setzero_ps();
222         fiz0             = _mm_setzero_ps();
223         fix1             = _mm_setzero_ps();
224         fiy1             = _mm_setzero_ps();
225         fiz1             = _mm_setzero_ps();
226         fix2             = _mm_setzero_ps();
227         fiy2             = _mm_setzero_ps();
228         fiz2             = _mm_setzero_ps();
229
230         /* Reset potential sums */
231         velecsum         = _mm_setzero_ps();
232         vvdwsum          = _mm_setzero_ps();
233
234         /* Start inner kernel loop */
235         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
236         {
237
238             /* Get j neighbor index, and coordinate index */
239             jnrA             = jjnr[jidx];
240             jnrB             = jjnr[jidx+1];
241             jnrC             = jjnr[jidx+2];
242             jnrD             = jjnr[jidx+3];
243             j_coord_offsetA  = DIM*jnrA;
244             j_coord_offsetB  = DIM*jnrB;
245             j_coord_offsetC  = DIM*jnrC;
246             j_coord_offsetD  = DIM*jnrD;
247
248             /* load j atom coordinates */
249             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
250                                               x+j_coord_offsetC,x+j_coord_offsetD,
251                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
252
253             /* Calculate displacement vector */
254             dx00             = _mm_sub_ps(ix0,jx0);
255             dy00             = _mm_sub_ps(iy0,jy0);
256             dz00             = _mm_sub_ps(iz0,jz0);
257             dx01             = _mm_sub_ps(ix0,jx1);
258             dy01             = _mm_sub_ps(iy0,jy1);
259             dz01             = _mm_sub_ps(iz0,jz1);
260             dx02             = _mm_sub_ps(ix0,jx2);
261             dy02             = _mm_sub_ps(iy0,jy2);
262             dz02             = _mm_sub_ps(iz0,jz2);
263             dx10             = _mm_sub_ps(ix1,jx0);
264             dy10             = _mm_sub_ps(iy1,jy0);
265             dz10             = _mm_sub_ps(iz1,jz0);
266             dx11             = _mm_sub_ps(ix1,jx1);
267             dy11             = _mm_sub_ps(iy1,jy1);
268             dz11             = _mm_sub_ps(iz1,jz1);
269             dx12             = _mm_sub_ps(ix1,jx2);
270             dy12             = _mm_sub_ps(iy1,jy2);
271             dz12             = _mm_sub_ps(iz1,jz2);
272             dx20             = _mm_sub_ps(ix2,jx0);
273             dy20             = _mm_sub_ps(iy2,jy0);
274             dz20             = _mm_sub_ps(iz2,jz0);
275             dx21             = _mm_sub_ps(ix2,jx1);
276             dy21             = _mm_sub_ps(iy2,jy1);
277             dz21             = _mm_sub_ps(iz2,jz1);
278             dx22             = _mm_sub_ps(ix2,jx2);
279             dy22             = _mm_sub_ps(iy2,jy2);
280             dz22             = _mm_sub_ps(iz2,jz2);
281
282             /* Calculate squared distance and things based on it */
283             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
284             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
285             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
286             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
287             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
288             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
289             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
290             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
291             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
292
293             rinv00           = gmx_mm_invsqrt_ps(rsq00);
294             rinv01           = gmx_mm_invsqrt_ps(rsq01);
295             rinv02           = gmx_mm_invsqrt_ps(rsq02);
296             rinv10           = gmx_mm_invsqrt_ps(rsq10);
297             rinv11           = gmx_mm_invsqrt_ps(rsq11);
298             rinv12           = gmx_mm_invsqrt_ps(rsq12);
299             rinv20           = gmx_mm_invsqrt_ps(rsq20);
300             rinv21           = gmx_mm_invsqrt_ps(rsq21);
301             rinv22           = gmx_mm_invsqrt_ps(rsq22);
302
303             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
304             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
305             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
306             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
307             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
308             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
309             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
310             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
311             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
312
313             fjx0             = _mm_setzero_ps();
314             fjy0             = _mm_setzero_ps();
315             fjz0             = _mm_setzero_ps();
316             fjx1             = _mm_setzero_ps();
317             fjy1             = _mm_setzero_ps();
318             fjz1             = _mm_setzero_ps();
319             fjx2             = _mm_setzero_ps();
320             fjy2             = _mm_setzero_ps();
321             fjz2             = _mm_setzero_ps();
322
323             /**************************
324              * CALCULATE INTERACTIONS *
325              **************************/
326
327             if (gmx_mm_any_lt(rsq00,rcutoff2))
328             {
329
330             r00              = _mm_mul_ps(rsq00,rinv00);
331
332             /* EWALD ELECTROSTATICS */
333
334             /* Analytical PME correction */
335             zeta2            = _mm_mul_ps(beta2,rsq00);
336             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
337             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
338             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
339             felec            = _mm_mul_ps(qq00,felec);
340             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
341             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
342             velec            = _mm_mul_ps(qq00,velec);
343
344             /* LENNARD-JONES DISPERSION/REPULSION */
345
346             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
347             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
348             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
349             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
350             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
351
352             d                = _mm_sub_ps(r00,rswitch);
353             d                = _mm_max_ps(d,_mm_setzero_ps());
354             d2               = _mm_mul_ps(d,d);
355             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
356
357             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
358
359             /* Evaluate switch function */
360             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
361             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
362             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
363             velec            = _mm_mul_ps(velec,sw);
364             vvdw             = _mm_mul_ps(vvdw,sw);
365             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
366
367             /* Update potential sum for this i atom from the interaction with this j atom. */
368             velec            = _mm_and_ps(velec,cutoff_mask);
369             velecsum         = _mm_add_ps(velecsum,velec);
370             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
371             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
372
373             fscal            = _mm_add_ps(felec,fvdw);
374
375             fscal            = _mm_and_ps(fscal,cutoff_mask);
376
377              /* Update vectorial force */
378             fix0             = _mm_macc_ps(dx00,fscal,fix0);
379             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
380             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
381
382             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
383             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
384             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
385
386             }
387
388             /**************************
389              * CALCULATE INTERACTIONS *
390              **************************/
391
392             if (gmx_mm_any_lt(rsq01,rcutoff2))
393             {
394
395             r01              = _mm_mul_ps(rsq01,rinv01);
396
397             /* EWALD ELECTROSTATICS */
398
399             /* Analytical PME correction */
400             zeta2            = _mm_mul_ps(beta2,rsq01);
401             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
402             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
403             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
404             felec            = _mm_mul_ps(qq01,felec);
405             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
406             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
407             velec            = _mm_mul_ps(qq01,velec);
408
409             d                = _mm_sub_ps(r01,rswitch);
410             d                = _mm_max_ps(d,_mm_setzero_ps());
411             d2               = _mm_mul_ps(d,d);
412             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
413
414             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
415
416             /* Evaluate switch function */
417             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
418             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
419             velec            = _mm_mul_ps(velec,sw);
420             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
421
422             /* Update potential sum for this i atom from the interaction with this j atom. */
423             velec            = _mm_and_ps(velec,cutoff_mask);
424             velecsum         = _mm_add_ps(velecsum,velec);
425
426             fscal            = felec;
427
428             fscal            = _mm_and_ps(fscal,cutoff_mask);
429
430              /* Update vectorial force */
431             fix0             = _mm_macc_ps(dx01,fscal,fix0);
432             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
433             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
434
435             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
436             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
437             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
438
439             }
440
441             /**************************
442              * CALCULATE INTERACTIONS *
443              **************************/
444
445             if (gmx_mm_any_lt(rsq02,rcutoff2))
446             {
447
448             r02              = _mm_mul_ps(rsq02,rinv02);
449
450             /* EWALD ELECTROSTATICS */
451
452             /* Analytical PME correction */
453             zeta2            = _mm_mul_ps(beta2,rsq02);
454             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
455             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
456             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
457             felec            = _mm_mul_ps(qq02,felec);
458             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
459             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
460             velec            = _mm_mul_ps(qq02,velec);
461
462             d                = _mm_sub_ps(r02,rswitch);
463             d                = _mm_max_ps(d,_mm_setzero_ps());
464             d2               = _mm_mul_ps(d,d);
465             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
466
467             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
468
469             /* Evaluate switch function */
470             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
471             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
472             velec            = _mm_mul_ps(velec,sw);
473             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
474
475             /* Update potential sum for this i atom from the interaction with this j atom. */
476             velec            = _mm_and_ps(velec,cutoff_mask);
477             velecsum         = _mm_add_ps(velecsum,velec);
478
479             fscal            = felec;
480
481             fscal            = _mm_and_ps(fscal,cutoff_mask);
482
483              /* Update vectorial force */
484             fix0             = _mm_macc_ps(dx02,fscal,fix0);
485             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
486             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
487
488             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
489             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
490             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
491
492             }
493
494             /**************************
495              * CALCULATE INTERACTIONS *
496              **************************/
497
498             if (gmx_mm_any_lt(rsq10,rcutoff2))
499             {
500
501             r10              = _mm_mul_ps(rsq10,rinv10);
502
503             /* EWALD ELECTROSTATICS */
504
505             /* Analytical PME correction */
506             zeta2            = _mm_mul_ps(beta2,rsq10);
507             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
508             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
509             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
510             felec            = _mm_mul_ps(qq10,felec);
511             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
512             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
513             velec            = _mm_mul_ps(qq10,velec);
514
515             d                = _mm_sub_ps(r10,rswitch);
516             d                = _mm_max_ps(d,_mm_setzero_ps());
517             d2               = _mm_mul_ps(d,d);
518             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
519
520             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
521
522             /* Evaluate switch function */
523             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
524             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
525             velec            = _mm_mul_ps(velec,sw);
526             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
527
528             /* Update potential sum for this i atom from the interaction with this j atom. */
529             velec            = _mm_and_ps(velec,cutoff_mask);
530             velecsum         = _mm_add_ps(velecsum,velec);
531
532             fscal            = felec;
533
534             fscal            = _mm_and_ps(fscal,cutoff_mask);
535
536              /* Update vectorial force */
537             fix1             = _mm_macc_ps(dx10,fscal,fix1);
538             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
539             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
540
541             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
542             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
543             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
544
545             }
546
547             /**************************
548              * CALCULATE INTERACTIONS *
549              **************************/
550
551             if (gmx_mm_any_lt(rsq11,rcutoff2))
552             {
553
554             r11              = _mm_mul_ps(rsq11,rinv11);
555
556             /* EWALD ELECTROSTATICS */
557
558             /* Analytical PME correction */
559             zeta2            = _mm_mul_ps(beta2,rsq11);
560             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
561             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
562             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
563             felec            = _mm_mul_ps(qq11,felec);
564             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
565             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
566             velec            = _mm_mul_ps(qq11,velec);
567
568             d                = _mm_sub_ps(r11,rswitch);
569             d                = _mm_max_ps(d,_mm_setzero_ps());
570             d2               = _mm_mul_ps(d,d);
571             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
572
573             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
574
575             /* Evaluate switch function */
576             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
577             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
578             velec            = _mm_mul_ps(velec,sw);
579             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
580
581             /* Update potential sum for this i atom from the interaction with this j atom. */
582             velec            = _mm_and_ps(velec,cutoff_mask);
583             velecsum         = _mm_add_ps(velecsum,velec);
584
585             fscal            = felec;
586
587             fscal            = _mm_and_ps(fscal,cutoff_mask);
588
589              /* Update vectorial force */
590             fix1             = _mm_macc_ps(dx11,fscal,fix1);
591             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
592             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
593
594             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
595             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
596             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
597
598             }
599
600             /**************************
601              * CALCULATE INTERACTIONS *
602              **************************/
603
604             if (gmx_mm_any_lt(rsq12,rcutoff2))
605             {
606
607             r12              = _mm_mul_ps(rsq12,rinv12);
608
609             /* EWALD ELECTROSTATICS */
610
611             /* Analytical PME correction */
612             zeta2            = _mm_mul_ps(beta2,rsq12);
613             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
614             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
615             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
616             felec            = _mm_mul_ps(qq12,felec);
617             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
618             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
619             velec            = _mm_mul_ps(qq12,velec);
620
621             d                = _mm_sub_ps(r12,rswitch);
622             d                = _mm_max_ps(d,_mm_setzero_ps());
623             d2               = _mm_mul_ps(d,d);
624             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
625
626             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
627
628             /* Evaluate switch function */
629             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
630             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
631             velec            = _mm_mul_ps(velec,sw);
632             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
633
634             /* Update potential sum for this i atom from the interaction with this j atom. */
635             velec            = _mm_and_ps(velec,cutoff_mask);
636             velecsum         = _mm_add_ps(velecsum,velec);
637
638             fscal            = felec;
639
640             fscal            = _mm_and_ps(fscal,cutoff_mask);
641
642              /* Update vectorial force */
643             fix1             = _mm_macc_ps(dx12,fscal,fix1);
644             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
645             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
646
647             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
648             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
649             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
650
651             }
652
653             /**************************
654              * CALCULATE INTERACTIONS *
655              **************************/
656
657             if (gmx_mm_any_lt(rsq20,rcutoff2))
658             {
659
660             r20              = _mm_mul_ps(rsq20,rinv20);
661
662             /* EWALD ELECTROSTATICS */
663
664             /* Analytical PME correction */
665             zeta2            = _mm_mul_ps(beta2,rsq20);
666             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
667             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
668             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
669             felec            = _mm_mul_ps(qq20,felec);
670             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
671             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
672             velec            = _mm_mul_ps(qq20,velec);
673
674             d                = _mm_sub_ps(r20,rswitch);
675             d                = _mm_max_ps(d,_mm_setzero_ps());
676             d2               = _mm_mul_ps(d,d);
677             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
678
679             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
680
681             /* Evaluate switch function */
682             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
683             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
684             velec            = _mm_mul_ps(velec,sw);
685             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
686
687             /* Update potential sum for this i atom from the interaction with this j atom. */
688             velec            = _mm_and_ps(velec,cutoff_mask);
689             velecsum         = _mm_add_ps(velecsum,velec);
690
691             fscal            = felec;
692
693             fscal            = _mm_and_ps(fscal,cutoff_mask);
694
695              /* Update vectorial force */
696             fix2             = _mm_macc_ps(dx20,fscal,fix2);
697             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
698             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
699
700             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
701             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
702             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
703
704             }
705
706             /**************************
707              * CALCULATE INTERACTIONS *
708              **************************/
709
710             if (gmx_mm_any_lt(rsq21,rcutoff2))
711             {
712
713             r21              = _mm_mul_ps(rsq21,rinv21);
714
715             /* EWALD ELECTROSTATICS */
716
717             /* Analytical PME correction */
718             zeta2            = _mm_mul_ps(beta2,rsq21);
719             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
720             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
721             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
722             felec            = _mm_mul_ps(qq21,felec);
723             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
724             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
725             velec            = _mm_mul_ps(qq21,velec);
726
727             d                = _mm_sub_ps(r21,rswitch);
728             d                = _mm_max_ps(d,_mm_setzero_ps());
729             d2               = _mm_mul_ps(d,d);
730             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
731
732             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
733
734             /* Evaluate switch function */
735             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
736             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
737             velec            = _mm_mul_ps(velec,sw);
738             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
739
740             /* Update potential sum for this i atom from the interaction with this j atom. */
741             velec            = _mm_and_ps(velec,cutoff_mask);
742             velecsum         = _mm_add_ps(velecsum,velec);
743
744             fscal            = felec;
745
746             fscal            = _mm_and_ps(fscal,cutoff_mask);
747
748              /* Update vectorial force */
749             fix2             = _mm_macc_ps(dx21,fscal,fix2);
750             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
751             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
752
753             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
754             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
755             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
756
757             }
758
759             /**************************
760              * CALCULATE INTERACTIONS *
761              **************************/
762
763             if (gmx_mm_any_lt(rsq22,rcutoff2))
764             {
765
766             r22              = _mm_mul_ps(rsq22,rinv22);
767
768             /* EWALD ELECTROSTATICS */
769
770             /* Analytical PME correction */
771             zeta2            = _mm_mul_ps(beta2,rsq22);
772             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
773             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
774             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
775             felec            = _mm_mul_ps(qq22,felec);
776             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
777             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
778             velec            = _mm_mul_ps(qq22,velec);
779
780             d                = _mm_sub_ps(r22,rswitch);
781             d                = _mm_max_ps(d,_mm_setzero_ps());
782             d2               = _mm_mul_ps(d,d);
783             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
784
785             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
786
787             /* Evaluate switch function */
788             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
789             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
790             velec            = _mm_mul_ps(velec,sw);
791             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
792
793             /* Update potential sum for this i atom from the interaction with this j atom. */
794             velec            = _mm_and_ps(velec,cutoff_mask);
795             velecsum         = _mm_add_ps(velecsum,velec);
796
797             fscal            = felec;
798
799             fscal            = _mm_and_ps(fscal,cutoff_mask);
800
801              /* Update vectorial force */
802             fix2             = _mm_macc_ps(dx22,fscal,fix2);
803             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
804             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
805
806             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
807             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
808             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
809
810             }
811
812             fjptrA             = f+j_coord_offsetA;
813             fjptrB             = f+j_coord_offsetB;
814             fjptrC             = f+j_coord_offsetC;
815             fjptrD             = f+j_coord_offsetD;
816
817             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
818                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
819
820             /* Inner loop uses 495 flops */
821         }
822
823         if(jidx<j_index_end)
824         {
825
826             /* Get j neighbor index, and coordinate index */
827             jnrlistA         = jjnr[jidx];
828             jnrlistB         = jjnr[jidx+1];
829             jnrlistC         = jjnr[jidx+2];
830             jnrlistD         = jjnr[jidx+3];
831             /* Sign of each element will be negative for non-real atoms.
832              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
833              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
834              */
835             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
836             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
837             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
838             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
839             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
840             j_coord_offsetA  = DIM*jnrA;
841             j_coord_offsetB  = DIM*jnrB;
842             j_coord_offsetC  = DIM*jnrC;
843             j_coord_offsetD  = DIM*jnrD;
844
845             /* load j atom coordinates */
846             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
847                                               x+j_coord_offsetC,x+j_coord_offsetD,
848                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
849
850             /* Calculate displacement vector */
851             dx00             = _mm_sub_ps(ix0,jx0);
852             dy00             = _mm_sub_ps(iy0,jy0);
853             dz00             = _mm_sub_ps(iz0,jz0);
854             dx01             = _mm_sub_ps(ix0,jx1);
855             dy01             = _mm_sub_ps(iy0,jy1);
856             dz01             = _mm_sub_ps(iz0,jz1);
857             dx02             = _mm_sub_ps(ix0,jx2);
858             dy02             = _mm_sub_ps(iy0,jy2);
859             dz02             = _mm_sub_ps(iz0,jz2);
860             dx10             = _mm_sub_ps(ix1,jx0);
861             dy10             = _mm_sub_ps(iy1,jy0);
862             dz10             = _mm_sub_ps(iz1,jz0);
863             dx11             = _mm_sub_ps(ix1,jx1);
864             dy11             = _mm_sub_ps(iy1,jy1);
865             dz11             = _mm_sub_ps(iz1,jz1);
866             dx12             = _mm_sub_ps(ix1,jx2);
867             dy12             = _mm_sub_ps(iy1,jy2);
868             dz12             = _mm_sub_ps(iz1,jz2);
869             dx20             = _mm_sub_ps(ix2,jx0);
870             dy20             = _mm_sub_ps(iy2,jy0);
871             dz20             = _mm_sub_ps(iz2,jz0);
872             dx21             = _mm_sub_ps(ix2,jx1);
873             dy21             = _mm_sub_ps(iy2,jy1);
874             dz21             = _mm_sub_ps(iz2,jz1);
875             dx22             = _mm_sub_ps(ix2,jx2);
876             dy22             = _mm_sub_ps(iy2,jy2);
877             dz22             = _mm_sub_ps(iz2,jz2);
878
879             /* Calculate squared distance and things based on it */
880             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
881             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
882             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
883             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
884             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
885             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
886             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
887             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
888             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
889
890             rinv00           = gmx_mm_invsqrt_ps(rsq00);
891             rinv01           = gmx_mm_invsqrt_ps(rsq01);
892             rinv02           = gmx_mm_invsqrt_ps(rsq02);
893             rinv10           = gmx_mm_invsqrt_ps(rsq10);
894             rinv11           = gmx_mm_invsqrt_ps(rsq11);
895             rinv12           = gmx_mm_invsqrt_ps(rsq12);
896             rinv20           = gmx_mm_invsqrt_ps(rsq20);
897             rinv21           = gmx_mm_invsqrt_ps(rsq21);
898             rinv22           = gmx_mm_invsqrt_ps(rsq22);
899
900             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
901             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
902             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
903             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
904             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
905             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
906             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
907             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
908             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
909
910             fjx0             = _mm_setzero_ps();
911             fjy0             = _mm_setzero_ps();
912             fjz0             = _mm_setzero_ps();
913             fjx1             = _mm_setzero_ps();
914             fjy1             = _mm_setzero_ps();
915             fjz1             = _mm_setzero_ps();
916             fjx2             = _mm_setzero_ps();
917             fjy2             = _mm_setzero_ps();
918             fjz2             = _mm_setzero_ps();
919
920             /**************************
921              * CALCULATE INTERACTIONS *
922              **************************/
923
924             if (gmx_mm_any_lt(rsq00,rcutoff2))
925             {
926
927             r00              = _mm_mul_ps(rsq00,rinv00);
928             r00              = _mm_andnot_ps(dummy_mask,r00);
929
930             /* EWALD ELECTROSTATICS */
931
932             /* Analytical PME correction */
933             zeta2            = _mm_mul_ps(beta2,rsq00);
934             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
935             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
936             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
937             felec            = _mm_mul_ps(qq00,felec);
938             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
939             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
940             velec            = _mm_mul_ps(qq00,velec);
941
942             /* LENNARD-JONES DISPERSION/REPULSION */
943
944             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
945             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
946             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
947             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
948             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
949
950             d                = _mm_sub_ps(r00,rswitch);
951             d                = _mm_max_ps(d,_mm_setzero_ps());
952             d2               = _mm_mul_ps(d,d);
953             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
954
955             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
956
957             /* Evaluate switch function */
958             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
959             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
960             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
961             velec            = _mm_mul_ps(velec,sw);
962             vvdw             = _mm_mul_ps(vvdw,sw);
963             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
964
965             /* Update potential sum for this i atom from the interaction with this j atom. */
966             velec            = _mm_and_ps(velec,cutoff_mask);
967             velec            = _mm_andnot_ps(dummy_mask,velec);
968             velecsum         = _mm_add_ps(velecsum,velec);
969             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
970             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
971             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
972
973             fscal            = _mm_add_ps(felec,fvdw);
974
975             fscal            = _mm_and_ps(fscal,cutoff_mask);
976
977             fscal            = _mm_andnot_ps(dummy_mask,fscal);
978
979              /* Update vectorial force */
980             fix0             = _mm_macc_ps(dx00,fscal,fix0);
981             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
982             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
983
984             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
985             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
986             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
987
988             }
989
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             if (gmx_mm_any_lt(rsq01,rcutoff2))
995             {
996
997             r01              = _mm_mul_ps(rsq01,rinv01);
998             r01              = _mm_andnot_ps(dummy_mask,r01);
999
1000             /* EWALD ELECTROSTATICS */
1001
1002             /* Analytical PME correction */
1003             zeta2            = _mm_mul_ps(beta2,rsq01);
1004             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1005             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1006             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1007             felec            = _mm_mul_ps(qq01,felec);
1008             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1009             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1010             velec            = _mm_mul_ps(qq01,velec);
1011
1012             d                = _mm_sub_ps(r01,rswitch);
1013             d                = _mm_max_ps(d,_mm_setzero_ps());
1014             d2               = _mm_mul_ps(d,d);
1015             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1016
1017             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1018
1019             /* Evaluate switch function */
1020             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1021             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1022             velec            = _mm_mul_ps(velec,sw);
1023             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1024
1025             /* Update potential sum for this i atom from the interaction with this j atom. */
1026             velec            = _mm_and_ps(velec,cutoff_mask);
1027             velec            = _mm_andnot_ps(dummy_mask,velec);
1028             velecsum         = _mm_add_ps(velecsum,velec);
1029
1030             fscal            = felec;
1031
1032             fscal            = _mm_and_ps(fscal,cutoff_mask);
1033
1034             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1035
1036              /* Update vectorial force */
1037             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1038             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1039             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1040
1041             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1042             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1043             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1044
1045             }
1046
1047             /**************************
1048              * CALCULATE INTERACTIONS *
1049              **************************/
1050
1051             if (gmx_mm_any_lt(rsq02,rcutoff2))
1052             {
1053
1054             r02              = _mm_mul_ps(rsq02,rinv02);
1055             r02              = _mm_andnot_ps(dummy_mask,r02);
1056
1057             /* EWALD ELECTROSTATICS */
1058
1059             /* Analytical PME correction */
1060             zeta2            = _mm_mul_ps(beta2,rsq02);
1061             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1062             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1063             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1064             felec            = _mm_mul_ps(qq02,felec);
1065             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1066             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1067             velec            = _mm_mul_ps(qq02,velec);
1068
1069             d                = _mm_sub_ps(r02,rswitch);
1070             d                = _mm_max_ps(d,_mm_setzero_ps());
1071             d2               = _mm_mul_ps(d,d);
1072             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1073
1074             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1075
1076             /* Evaluate switch function */
1077             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1078             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1079             velec            = _mm_mul_ps(velec,sw);
1080             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1081
1082             /* Update potential sum for this i atom from the interaction with this j atom. */
1083             velec            = _mm_and_ps(velec,cutoff_mask);
1084             velec            = _mm_andnot_ps(dummy_mask,velec);
1085             velecsum         = _mm_add_ps(velecsum,velec);
1086
1087             fscal            = felec;
1088
1089             fscal            = _mm_and_ps(fscal,cutoff_mask);
1090
1091             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1092
1093              /* Update vectorial force */
1094             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1095             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1096             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1097
1098             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1099             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1100             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1101
1102             }
1103
1104             /**************************
1105              * CALCULATE INTERACTIONS *
1106              **************************/
1107
1108             if (gmx_mm_any_lt(rsq10,rcutoff2))
1109             {
1110
1111             r10              = _mm_mul_ps(rsq10,rinv10);
1112             r10              = _mm_andnot_ps(dummy_mask,r10);
1113
1114             /* EWALD ELECTROSTATICS */
1115
1116             /* Analytical PME correction */
1117             zeta2            = _mm_mul_ps(beta2,rsq10);
1118             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1119             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1120             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1121             felec            = _mm_mul_ps(qq10,felec);
1122             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1123             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1124             velec            = _mm_mul_ps(qq10,velec);
1125
1126             d                = _mm_sub_ps(r10,rswitch);
1127             d                = _mm_max_ps(d,_mm_setzero_ps());
1128             d2               = _mm_mul_ps(d,d);
1129             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1130
1131             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1132
1133             /* Evaluate switch function */
1134             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1135             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1136             velec            = _mm_mul_ps(velec,sw);
1137             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1138
1139             /* Update potential sum for this i atom from the interaction with this j atom. */
1140             velec            = _mm_and_ps(velec,cutoff_mask);
1141             velec            = _mm_andnot_ps(dummy_mask,velec);
1142             velecsum         = _mm_add_ps(velecsum,velec);
1143
1144             fscal            = felec;
1145
1146             fscal            = _mm_and_ps(fscal,cutoff_mask);
1147
1148             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1149
1150              /* Update vectorial force */
1151             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1152             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1153             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1154
1155             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1156             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1157             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1158
1159             }
1160
1161             /**************************
1162              * CALCULATE INTERACTIONS *
1163              **************************/
1164
1165             if (gmx_mm_any_lt(rsq11,rcutoff2))
1166             {
1167
1168             r11              = _mm_mul_ps(rsq11,rinv11);
1169             r11              = _mm_andnot_ps(dummy_mask,r11);
1170
1171             /* EWALD ELECTROSTATICS */
1172
1173             /* Analytical PME correction */
1174             zeta2            = _mm_mul_ps(beta2,rsq11);
1175             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1176             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1177             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1178             felec            = _mm_mul_ps(qq11,felec);
1179             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1180             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1181             velec            = _mm_mul_ps(qq11,velec);
1182
1183             d                = _mm_sub_ps(r11,rswitch);
1184             d                = _mm_max_ps(d,_mm_setzero_ps());
1185             d2               = _mm_mul_ps(d,d);
1186             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1187
1188             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1189
1190             /* Evaluate switch function */
1191             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1192             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1193             velec            = _mm_mul_ps(velec,sw);
1194             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1195
1196             /* Update potential sum for this i atom from the interaction with this j atom. */
1197             velec            = _mm_and_ps(velec,cutoff_mask);
1198             velec            = _mm_andnot_ps(dummy_mask,velec);
1199             velecsum         = _mm_add_ps(velecsum,velec);
1200
1201             fscal            = felec;
1202
1203             fscal            = _mm_and_ps(fscal,cutoff_mask);
1204
1205             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1206
1207              /* Update vectorial force */
1208             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1209             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1210             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1211
1212             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1213             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1214             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1215
1216             }
1217
1218             /**************************
1219              * CALCULATE INTERACTIONS *
1220              **************************/
1221
1222             if (gmx_mm_any_lt(rsq12,rcutoff2))
1223             {
1224
1225             r12              = _mm_mul_ps(rsq12,rinv12);
1226             r12              = _mm_andnot_ps(dummy_mask,r12);
1227
1228             /* EWALD ELECTROSTATICS */
1229
1230             /* Analytical PME correction */
1231             zeta2            = _mm_mul_ps(beta2,rsq12);
1232             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1233             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1234             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1235             felec            = _mm_mul_ps(qq12,felec);
1236             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1237             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1238             velec            = _mm_mul_ps(qq12,velec);
1239
1240             d                = _mm_sub_ps(r12,rswitch);
1241             d                = _mm_max_ps(d,_mm_setzero_ps());
1242             d2               = _mm_mul_ps(d,d);
1243             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1244
1245             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1246
1247             /* Evaluate switch function */
1248             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1249             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1250             velec            = _mm_mul_ps(velec,sw);
1251             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1252
1253             /* Update potential sum for this i atom from the interaction with this j atom. */
1254             velec            = _mm_and_ps(velec,cutoff_mask);
1255             velec            = _mm_andnot_ps(dummy_mask,velec);
1256             velecsum         = _mm_add_ps(velecsum,velec);
1257
1258             fscal            = felec;
1259
1260             fscal            = _mm_and_ps(fscal,cutoff_mask);
1261
1262             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1263
1264              /* Update vectorial force */
1265             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1266             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1267             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1268
1269             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1270             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1271             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1272
1273             }
1274
1275             /**************************
1276              * CALCULATE INTERACTIONS *
1277              **************************/
1278
1279             if (gmx_mm_any_lt(rsq20,rcutoff2))
1280             {
1281
1282             r20              = _mm_mul_ps(rsq20,rinv20);
1283             r20              = _mm_andnot_ps(dummy_mask,r20);
1284
1285             /* EWALD ELECTROSTATICS */
1286
1287             /* Analytical PME correction */
1288             zeta2            = _mm_mul_ps(beta2,rsq20);
1289             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1290             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1291             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1292             felec            = _mm_mul_ps(qq20,felec);
1293             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1294             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1295             velec            = _mm_mul_ps(qq20,velec);
1296
1297             d                = _mm_sub_ps(r20,rswitch);
1298             d                = _mm_max_ps(d,_mm_setzero_ps());
1299             d2               = _mm_mul_ps(d,d);
1300             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1301
1302             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1303
1304             /* Evaluate switch function */
1305             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1306             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1307             velec            = _mm_mul_ps(velec,sw);
1308             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1309
1310             /* Update potential sum for this i atom from the interaction with this j atom. */
1311             velec            = _mm_and_ps(velec,cutoff_mask);
1312             velec            = _mm_andnot_ps(dummy_mask,velec);
1313             velecsum         = _mm_add_ps(velecsum,velec);
1314
1315             fscal            = felec;
1316
1317             fscal            = _mm_and_ps(fscal,cutoff_mask);
1318
1319             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1320
1321              /* Update vectorial force */
1322             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1323             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1324             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1325
1326             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1327             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1328             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1329
1330             }
1331
1332             /**************************
1333              * CALCULATE INTERACTIONS *
1334              **************************/
1335
1336             if (gmx_mm_any_lt(rsq21,rcutoff2))
1337             {
1338
1339             r21              = _mm_mul_ps(rsq21,rinv21);
1340             r21              = _mm_andnot_ps(dummy_mask,r21);
1341
1342             /* EWALD ELECTROSTATICS */
1343
1344             /* Analytical PME correction */
1345             zeta2            = _mm_mul_ps(beta2,rsq21);
1346             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1347             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1348             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1349             felec            = _mm_mul_ps(qq21,felec);
1350             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1351             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1352             velec            = _mm_mul_ps(qq21,velec);
1353
1354             d                = _mm_sub_ps(r21,rswitch);
1355             d                = _mm_max_ps(d,_mm_setzero_ps());
1356             d2               = _mm_mul_ps(d,d);
1357             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1358
1359             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1360
1361             /* Evaluate switch function */
1362             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1363             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1364             velec            = _mm_mul_ps(velec,sw);
1365             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1366
1367             /* Update potential sum for this i atom from the interaction with this j atom. */
1368             velec            = _mm_and_ps(velec,cutoff_mask);
1369             velec            = _mm_andnot_ps(dummy_mask,velec);
1370             velecsum         = _mm_add_ps(velecsum,velec);
1371
1372             fscal            = felec;
1373
1374             fscal            = _mm_and_ps(fscal,cutoff_mask);
1375
1376             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1377
1378              /* Update vectorial force */
1379             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1380             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1381             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1382
1383             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1384             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1385             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1386
1387             }
1388
1389             /**************************
1390              * CALCULATE INTERACTIONS *
1391              **************************/
1392
1393             if (gmx_mm_any_lt(rsq22,rcutoff2))
1394             {
1395
1396             r22              = _mm_mul_ps(rsq22,rinv22);
1397             r22              = _mm_andnot_ps(dummy_mask,r22);
1398
1399             /* EWALD ELECTROSTATICS */
1400
1401             /* Analytical PME correction */
1402             zeta2            = _mm_mul_ps(beta2,rsq22);
1403             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1404             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1405             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1406             felec            = _mm_mul_ps(qq22,felec);
1407             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1408             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1409             velec            = _mm_mul_ps(qq22,velec);
1410
1411             d                = _mm_sub_ps(r22,rswitch);
1412             d                = _mm_max_ps(d,_mm_setzero_ps());
1413             d2               = _mm_mul_ps(d,d);
1414             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1415
1416             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1417
1418             /* Evaluate switch function */
1419             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1420             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1421             velec            = _mm_mul_ps(velec,sw);
1422             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1423
1424             /* Update potential sum for this i atom from the interaction with this j atom. */
1425             velec            = _mm_and_ps(velec,cutoff_mask);
1426             velec            = _mm_andnot_ps(dummy_mask,velec);
1427             velecsum         = _mm_add_ps(velecsum,velec);
1428
1429             fscal            = felec;
1430
1431             fscal            = _mm_and_ps(fscal,cutoff_mask);
1432
1433             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1434
1435              /* Update vectorial force */
1436             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1437             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1438             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1439
1440             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1441             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1442             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1443
1444             }
1445
1446             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1447             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1448             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1449             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1450
1451             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1452                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1453
1454             /* Inner loop uses 504 flops */
1455         }
1456
1457         /* End of innermost loop */
1458
1459         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1460                                               f+i_coord_offset,fshift+i_shift_offset);
1461
1462         ggid                        = gid[iidx];
1463         /* Update potential energies */
1464         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1465         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1466
1467         /* Increment number of inner iterations */
1468         inneriter                  += j_index_end - j_index_start;
1469
1470         /* Outer loop uses 20 flops */
1471     }
1472
1473     /* Increment number of outer iterations */
1474     outeriter        += nri;
1475
1476     /* Update outer/inner flops */
1477
1478     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*504);
1479 }
1480 /*
1481  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1482  * Electrostatics interaction: Ewald
1483  * VdW interaction:            LennardJones
1484  * Geometry:                   Water3-Water3
1485  * Calculate force/pot:        Force
1486  */
1487 void
1488 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1489                     (t_nblist                    * gmx_restrict       nlist,
1490                      rvec                        * gmx_restrict          xx,
1491                      rvec                        * gmx_restrict          ff,
1492                      t_forcerec                  * gmx_restrict          fr,
1493                      t_mdatoms                   * gmx_restrict     mdatoms,
1494                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1495                      t_nrnb                      * gmx_restrict        nrnb)
1496 {
1497     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1498      * just 0 for non-waters.
1499      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1500      * jnr indices corresponding to data put in the four positions in the SIMD register.
1501      */
1502     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1503     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1504     int              jnrA,jnrB,jnrC,jnrD;
1505     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1506     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1507     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1508     real             rcutoff_scalar;
1509     real             *shiftvec,*fshift,*x,*f;
1510     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1511     real             scratch[4*DIM];
1512     __m128           fscal,rcutoff,rcutoff2,jidxall;
1513     int              vdwioffset0;
1514     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1515     int              vdwioffset1;
1516     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1517     int              vdwioffset2;
1518     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1519     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1520     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1521     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1522     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1523     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1524     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1525     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1526     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1527     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1528     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1529     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1530     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1531     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1532     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1533     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1534     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1535     real             *charge;
1536     int              nvdwtype;
1537     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1538     int              *vdwtype;
1539     real             *vdwparam;
1540     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1541     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1542     __m128i          ewitab;
1543     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1544     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1545     real             *ewtab;
1546     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1547     real             rswitch_scalar,d_scalar;
1548     __m128           dummy_mask,cutoff_mask;
1549     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1550     __m128           one     = _mm_set1_ps(1.0);
1551     __m128           two     = _mm_set1_ps(2.0);
1552     x                = xx[0];
1553     f                = ff[0];
1554
1555     nri              = nlist->nri;
1556     iinr             = nlist->iinr;
1557     jindex           = nlist->jindex;
1558     jjnr             = nlist->jjnr;
1559     shiftidx         = nlist->shift;
1560     gid              = nlist->gid;
1561     shiftvec         = fr->shift_vec[0];
1562     fshift           = fr->fshift[0];
1563     facel            = _mm_set1_ps(fr->epsfac);
1564     charge           = mdatoms->chargeA;
1565     nvdwtype         = fr->ntype;
1566     vdwparam         = fr->nbfp;
1567     vdwtype          = mdatoms->typeA;
1568
1569     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1570     beta             = _mm_set1_ps(fr->ic->ewaldcoeff_q);
1571     beta2            = _mm_mul_ps(beta,beta);
1572     beta3            = _mm_mul_ps(beta,beta2);
1573     ewtab            = fr->ic->tabq_coul_FDV0;
1574     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1575     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1576
1577     /* Setup water-specific parameters */
1578     inr              = nlist->iinr[0];
1579     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1580     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1581     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1582     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1583
1584     jq0              = _mm_set1_ps(charge[inr+0]);
1585     jq1              = _mm_set1_ps(charge[inr+1]);
1586     jq2              = _mm_set1_ps(charge[inr+2]);
1587     vdwjidx0A        = 2*vdwtype[inr+0];
1588     qq00             = _mm_mul_ps(iq0,jq0);
1589     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1590     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1591     qq01             = _mm_mul_ps(iq0,jq1);
1592     qq02             = _mm_mul_ps(iq0,jq2);
1593     qq10             = _mm_mul_ps(iq1,jq0);
1594     qq11             = _mm_mul_ps(iq1,jq1);
1595     qq12             = _mm_mul_ps(iq1,jq2);
1596     qq20             = _mm_mul_ps(iq2,jq0);
1597     qq21             = _mm_mul_ps(iq2,jq1);
1598     qq22             = _mm_mul_ps(iq2,jq2);
1599
1600     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1601     rcutoff_scalar   = fr->rcoulomb;
1602     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1603     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1604
1605     rswitch_scalar   = fr->rcoulomb_switch;
1606     rswitch          = _mm_set1_ps(rswitch_scalar);
1607     /* Setup switch parameters */
1608     d_scalar         = rcutoff_scalar-rswitch_scalar;
1609     d                = _mm_set1_ps(d_scalar);
1610     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1611     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1612     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1613     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1614     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1615     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1616
1617     /* Avoid stupid compiler warnings */
1618     jnrA = jnrB = jnrC = jnrD = 0;
1619     j_coord_offsetA = 0;
1620     j_coord_offsetB = 0;
1621     j_coord_offsetC = 0;
1622     j_coord_offsetD = 0;
1623
1624     outeriter        = 0;
1625     inneriter        = 0;
1626
1627     for(iidx=0;iidx<4*DIM;iidx++)
1628     {
1629         scratch[iidx] = 0.0;
1630     }
1631
1632     /* Start outer loop over neighborlists */
1633     for(iidx=0; iidx<nri; iidx++)
1634     {
1635         /* Load shift vector for this list */
1636         i_shift_offset   = DIM*shiftidx[iidx];
1637
1638         /* Load limits for loop over neighbors */
1639         j_index_start    = jindex[iidx];
1640         j_index_end      = jindex[iidx+1];
1641
1642         /* Get outer coordinate index */
1643         inr              = iinr[iidx];
1644         i_coord_offset   = DIM*inr;
1645
1646         /* Load i particle coords and add shift vector */
1647         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1648                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1649
1650         fix0             = _mm_setzero_ps();
1651         fiy0             = _mm_setzero_ps();
1652         fiz0             = _mm_setzero_ps();
1653         fix1             = _mm_setzero_ps();
1654         fiy1             = _mm_setzero_ps();
1655         fiz1             = _mm_setzero_ps();
1656         fix2             = _mm_setzero_ps();
1657         fiy2             = _mm_setzero_ps();
1658         fiz2             = _mm_setzero_ps();
1659
1660         /* Start inner kernel loop */
1661         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1662         {
1663
1664             /* Get j neighbor index, and coordinate index */
1665             jnrA             = jjnr[jidx];
1666             jnrB             = jjnr[jidx+1];
1667             jnrC             = jjnr[jidx+2];
1668             jnrD             = jjnr[jidx+3];
1669             j_coord_offsetA  = DIM*jnrA;
1670             j_coord_offsetB  = DIM*jnrB;
1671             j_coord_offsetC  = DIM*jnrC;
1672             j_coord_offsetD  = DIM*jnrD;
1673
1674             /* load j atom coordinates */
1675             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1676                                               x+j_coord_offsetC,x+j_coord_offsetD,
1677                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1678
1679             /* Calculate displacement vector */
1680             dx00             = _mm_sub_ps(ix0,jx0);
1681             dy00             = _mm_sub_ps(iy0,jy0);
1682             dz00             = _mm_sub_ps(iz0,jz0);
1683             dx01             = _mm_sub_ps(ix0,jx1);
1684             dy01             = _mm_sub_ps(iy0,jy1);
1685             dz01             = _mm_sub_ps(iz0,jz1);
1686             dx02             = _mm_sub_ps(ix0,jx2);
1687             dy02             = _mm_sub_ps(iy0,jy2);
1688             dz02             = _mm_sub_ps(iz0,jz2);
1689             dx10             = _mm_sub_ps(ix1,jx0);
1690             dy10             = _mm_sub_ps(iy1,jy0);
1691             dz10             = _mm_sub_ps(iz1,jz0);
1692             dx11             = _mm_sub_ps(ix1,jx1);
1693             dy11             = _mm_sub_ps(iy1,jy1);
1694             dz11             = _mm_sub_ps(iz1,jz1);
1695             dx12             = _mm_sub_ps(ix1,jx2);
1696             dy12             = _mm_sub_ps(iy1,jy2);
1697             dz12             = _mm_sub_ps(iz1,jz2);
1698             dx20             = _mm_sub_ps(ix2,jx0);
1699             dy20             = _mm_sub_ps(iy2,jy0);
1700             dz20             = _mm_sub_ps(iz2,jz0);
1701             dx21             = _mm_sub_ps(ix2,jx1);
1702             dy21             = _mm_sub_ps(iy2,jy1);
1703             dz21             = _mm_sub_ps(iz2,jz1);
1704             dx22             = _mm_sub_ps(ix2,jx2);
1705             dy22             = _mm_sub_ps(iy2,jy2);
1706             dz22             = _mm_sub_ps(iz2,jz2);
1707
1708             /* Calculate squared distance and things based on it */
1709             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1710             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1711             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1712             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1713             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1714             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1715             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1716             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1717             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1718
1719             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1720             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1721             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1722             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1723             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1724             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1725             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1726             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1727             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1728
1729             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1730             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1731             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1732             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1733             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1734             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1735             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1736             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1737             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1738
1739             fjx0             = _mm_setzero_ps();
1740             fjy0             = _mm_setzero_ps();
1741             fjz0             = _mm_setzero_ps();
1742             fjx1             = _mm_setzero_ps();
1743             fjy1             = _mm_setzero_ps();
1744             fjz1             = _mm_setzero_ps();
1745             fjx2             = _mm_setzero_ps();
1746             fjy2             = _mm_setzero_ps();
1747             fjz2             = _mm_setzero_ps();
1748
1749             /**************************
1750              * CALCULATE INTERACTIONS *
1751              **************************/
1752
1753             if (gmx_mm_any_lt(rsq00,rcutoff2))
1754             {
1755
1756             r00              = _mm_mul_ps(rsq00,rinv00);
1757
1758             /* EWALD ELECTROSTATICS */
1759
1760             /* Analytical PME correction */
1761             zeta2            = _mm_mul_ps(beta2,rsq00);
1762             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1763             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1764             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1765             felec            = _mm_mul_ps(qq00,felec);
1766             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1767             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
1768             velec            = _mm_mul_ps(qq00,velec);
1769
1770             /* LENNARD-JONES DISPERSION/REPULSION */
1771
1772             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1773             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1774             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1775             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1776             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1777
1778             d                = _mm_sub_ps(r00,rswitch);
1779             d                = _mm_max_ps(d,_mm_setzero_ps());
1780             d2               = _mm_mul_ps(d,d);
1781             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1782
1783             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1784
1785             /* Evaluate switch function */
1786             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1787             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1788             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1789             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1790
1791             fscal            = _mm_add_ps(felec,fvdw);
1792
1793             fscal            = _mm_and_ps(fscal,cutoff_mask);
1794
1795              /* Update vectorial force */
1796             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1797             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1798             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1799
1800             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1801             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1802             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1803
1804             }
1805
1806             /**************************
1807              * CALCULATE INTERACTIONS *
1808              **************************/
1809
1810             if (gmx_mm_any_lt(rsq01,rcutoff2))
1811             {
1812
1813             r01              = _mm_mul_ps(rsq01,rinv01);
1814
1815             /* EWALD ELECTROSTATICS */
1816
1817             /* Analytical PME correction */
1818             zeta2            = _mm_mul_ps(beta2,rsq01);
1819             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1820             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1821             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1822             felec            = _mm_mul_ps(qq01,felec);
1823             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1824             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1825             velec            = _mm_mul_ps(qq01,velec);
1826
1827             d                = _mm_sub_ps(r01,rswitch);
1828             d                = _mm_max_ps(d,_mm_setzero_ps());
1829             d2               = _mm_mul_ps(d,d);
1830             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1831
1832             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1833
1834             /* Evaluate switch function */
1835             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1836             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1837             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1838
1839             fscal            = felec;
1840
1841             fscal            = _mm_and_ps(fscal,cutoff_mask);
1842
1843              /* Update vectorial force */
1844             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1845             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1846             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1847
1848             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1849             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1850             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1851
1852             }
1853
1854             /**************************
1855              * CALCULATE INTERACTIONS *
1856              **************************/
1857
1858             if (gmx_mm_any_lt(rsq02,rcutoff2))
1859             {
1860
1861             r02              = _mm_mul_ps(rsq02,rinv02);
1862
1863             /* EWALD ELECTROSTATICS */
1864
1865             /* Analytical PME correction */
1866             zeta2            = _mm_mul_ps(beta2,rsq02);
1867             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1868             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1869             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1870             felec            = _mm_mul_ps(qq02,felec);
1871             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1872             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1873             velec            = _mm_mul_ps(qq02,velec);
1874
1875             d                = _mm_sub_ps(r02,rswitch);
1876             d                = _mm_max_ps(d,_mm_setzero_ps());
1877             d2               = _mm_mul_ps(d,d);
1878             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1879
1880             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1881
1882             /* Evaluate switch function */
1883             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1884             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1885             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1886
1887             fscal            = felec;
1888
1889             fscal            = _mm_and_ps(fscal,cutoff_mask);
1890
1891              /* Update vectorial force */
1892             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1893             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1894             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1895
1896             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1897             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1898             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1899
1900             }
1901
1902             /**************************
1903              * CALCULATE INTERACTIONS *
1904              **************************/
1905
1906             if (gmx_mm_any_lt(rsq10,rcutoff2))
1907             {
1908
1909             r10              = _mm_mul_ps(rsq10,rinv10);
1910
1911             /* EWALD ELECTROSTATICS */
1912
1913             /* Analytical PME correction */
1914             zeta2            = _mm_mul_ps(beta2,rsq10);
1915             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1916             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1917             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1918             felec            = _mm_mul_ps(qq10,felec);
1919             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1920             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1921             velec            = _mm_mul_ps(qq10,velec);
1922
1923             d                = _mm_sub_ps(r10,rswitch);
1924             d                = _mm_max_ps(d,_mm_setzero_ps());
1925             d2               = _mm_mul_ps(d,d);
1926             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1927
1928             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1929
1930             /* Evaluate switch function */
1931             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1932             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1933             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1934
1935             fscal            = felec;
1936
1937             fscal            = _mm_and_ps(fscal,cutoff_mask);
1938
1939              /* Update vectorial force */
1940             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1941             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1942             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1943
1944             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1945             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1946             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1947
1948             }
1949
1950             /**************************
1951              * CALCULATE INTERACTIONS *
1952              **************************/
1953
1954             if (gmx_mm_any_lt(rsq11,rcutoff2))
1955             {
1956
1957             r11              = _mm_mul_ps(rsq11,rinv11);
1958
1959             /* EWALD ELECTROSTATICS */
1960
1961             /* Analytical PME correction */
1962             zeta2            = _mm_mul_ps(beta2,rsq11);
1963             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1964             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1965             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1966             felec            = _mm_mul_ps(qq11,felec);
1967             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1968             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1969             velec            = _mm_mul_ps(qq11,velec);
1970
1971             d                = _mm_sub_ps(r11,rswitch);
1972             d                = _mm_max_ps(d,_mm_setzero_ps());
1973             d2               = _mm_mul_ps(d,d);
1974             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1975
1976             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1977
1978             /* Evaluate switch function */
1979             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1980             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1981             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1982
1983             fscal            = felec;
1984
1985             fscal            = _mm_and_ps(fscal,cutoff_mask);
1986
1987              /* Update vectorial force */
1988             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1989             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1990             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1991
1992             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1993             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1994             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1995
1996             }
1997
1998             /**************************
1999              * CALCULATE INTERACTIONS *
2000              **************************/
2001
2002             if (gmx_mm_any_lt(rsq12,rcutoff2))
2003             {
2004
2005             r12              = _mm_mul_ps(rsq12,rinv12);
2006
2007             /* EWALD ELECTROSTATICS */
2008
2009             /* Analytical PME correction */
2010             zeta2            = _mm_mul_ps(beta2,rsq12);
2011             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
2012             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2013             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2014             felec            = _mm_mul_ps(qq12,felec);
2015             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2016             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2017             velec            = _mm_mul_ps(qq12,velec);
2018
2019             d                = _mm_sub_ps(r12,rswitch);
2020             d                = _mm_max_ps(d,_mm_setzero_ps());
2021             d2               = _mm_mul_ps(d,d);
2022             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2023
2024             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2025
2026             /* Evaluate switch function */
2027             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2028             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2029             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2030
2031             fscal            = felec;
2032
2033             fscal            = _mm_and_ps(fscal,cutoff_mask);
2034
2035              /* Update vectorial force */
2036             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2037             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2038             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2039
2040             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2041             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2042             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2043
2044             }
2045
2046             /**************************
2047              * CALCULATE INTERACTIONS *
2048              **************************/
2049
2050             if (gmx_mm_any_lt(rsq20,rcutoff2))
2051             {
2052
2053             r20              = _mm_mul_ps(rsq20,rinv20);
2054
2055             /* EWALD ELECTROSTATICS */
2056
2057             /* Analytical PME correction */
2058             zeta2            = _mm_mul_ps(beta2,rsq20);
2059             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
2060             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2061             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2062             felec            = _mm_mul_ps(qq20,felec);
2063             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2064             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2065             velec            = _mm_mul_ps(qq20,velec);
2066
2067             d                = _mm_sub_ps(r20,rswitch);
2068             d                = _mm_max_ps(d,_mm_setzero_ps());
2069             d2               = _mm_mul_ps(d,d);
2070             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2071
2072             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2073
2074             /* Evaluate switch function */
2075             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2076             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2077             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2078
2079             fscal            = felec;
2080
2081             fscal            = _mm_and_ps(fscal,cutoff_mask);
2082
2083              /* Update vectorial force */
2084             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2085             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2086             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2087
2088             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2089             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2090             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2091
2092             }
2093
2094             /**************************
2095              * CALCULATE INTERACTIONS *
2096              **************************/
2097
2098             if (gmx_mm_any_lt(rsq21,rcutoff2))
2099             {
2100
2101             r21              = _mm_mul_ps(rsq21,rinv21);
2102
2103             /* EWALD ELECTROSTATICS */
2104
2105             /* Analytical PME correction */
2106             zeta2            = _mm_mul_ps(beta2,rsq21);
2107             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2108             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2109             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2110             felec            = _mm_mul_ps(qq21,felec);
2111             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2112             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2113             velec            = _mm_mul_ps(qq21,velec);
2114
2115             d                = _mm_sub_ps(r21,rswitch);
2116             d                = _mm_max_ps(d,_mm_setzero_ps());
2117             d2               = _mm_mul_ps(d,d);
2118             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2119
2120             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2121
2122             /* Evaluate switch function */
2123             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2124             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2125             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2126
2127             fscal            = felec;
2128
2129             fscal            = _mm_and_ps(fscal,cutoff_mask);
2130
2131              /* Update vectorial force */
2132             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2133             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2134             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2135
2136             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2137             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2138             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2139
2140             }
2141
2142             /**************************
2143              * CALCULATE INTERACTIONS *
2144              **************************/
2145
2146             if (gmx_mm_any_lt(rsq22,rcutoff2))
2147             {
2148
2149             r22              = _mm_mul_ps(rsq22,rinv22);
2150
2151             /* EWALD ELECTROSTATICS */
2152
2153             /* Analytical PME correction */
2154             zeta2            = _mm_mul_ps(beta2,rsq22);
2155             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2156             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2157             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2158             felec            = _mm_mul_ps(qq22,felec);
2159             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2160             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2161             velec            = _mm_mul_ps(qq22,velec);
2162
2163             d                = _mm_sub_ps(r22,rswitch);
2164             d                = _mm_max_ps(d,_mm_setzero_ps());
2165             d2               = _mm_mul_ps(d,d);
2166             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2167
2168             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2169
2170             /* Evaluate switch function */
2171             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2172             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2173             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2174
2175             fscal            = felec;
2176
2177             fscal            = _mm_and_ps(fscal,cutoff_mask);
2178
2179              /* Update vectorial force */
2180             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2181             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2182             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2183
2184             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2185             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2186             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2187
2188             }
2189
2190             fjptrA             = f+j_coord_offsetA;
2191             fjptrB             = f+j_coord_offsetB;
2192             fjptrC             = f+j_coord_offsetC;
2193             fjptrD             = f+j_coord_offsetD;
2194
2195             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2196                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2197
2198             /* Inner loop uses 465 flops */
2199         }
2200
2201         if(jidx<j_index_end)
2202         {
2203
2204             /* Get j neighbor index, and coordinate index */
2205             jnrlistA         = jjnr[jidx];
2206             jnrlistB         = jjnr[jidx+1];
2207             jnrlistC         = jjnr[jidx+2];
2208             jnrlistD         = jjnr[jidx+3];
2209             /* Sign of each element will be negative for non-real atoms.
2210              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2211              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2212              */
2213             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2214             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2215             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2216             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2217             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2218             j_coord_offsetA  = DIM*jnrA;
2219             j_coord_offsetB  = DIM*jnrB;
2220             j_coord_offsetC  = DIM*jnrC;
2221             j_coord_offsetD  = DIM*jnrD;
2222
2223             /* load j atom coordinates */
2224             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2225                                               x+j_coord_offsetC,x+j_coord_offsetD,
2226                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2227
2228             /* Calculate displacement vector */
2229             dx00             = _mm_sub_ps(ix0,jx0);
2230             dy00             = _mm_sub_ps(iy0,jy0);
2231             dz00             = _mm_sub_ps(iz0,jz0);
2232             dx01             = _mm_sub_ps(ix0,jx1);
2233             dy01             = _mm_sub_ps(iy0,jy1);
2234             dz01             = _mm_sub_ps(iz0,jz1);
2235             dx02             = _mm_sub_ps(ix0,jx2);
2236             dy02             = _mm_sub_ps(iy0,jy2);
2237             dz02             = _mm_sub_ps(iz0,jz2);
2238             dx10             = _mm_sub_ps(ix1,jx0);
2239             dy10             = _mm_sub_ps(iy1,jy0);
2240             dz10             = _mm_sub_ps(iz1,jz0);
2241             dx11             = _mm_sub_ps(ix1,jx1);
2242             dy11             = _mm_sub_ps(iy1,jy1);
2243             dz11             = _mm_sub_ps(iz1,jz1);
2244             dx12             = _mm_sub_ps(ix1,jx2);
2245             dy12             = _mm_sub_ps(iy1,jy2);
2246             dz12             = _mm_sub_ps(iz1,jz2);
2247             dx20             = _mm_sub_ps(ix2,jx0);
2248             dy20             = _mm_sub_ps(iy2,jy0);
2249             dz20             = _mm_sub_ps(iz2,jz0);
2250             dx21             = _mm_sub_ps(ix2,jx1);
2251             dy21             = _mm_sub_ps(iy2,jy1);
2252             dz21             = _mm_sub_ps(iz2,jz1);
2253             dx22             = _mm_sub_ps(ix2,jx2);
2254             dy22             = _mm_sub_ps(iy2,jy2);
2255             dz22             = _mm_sub_ps(iz2,jz2);
2256
2257             /* Calculate squared distance and things based on it */
2258             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2259             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2260             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2261             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2262             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2263             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2264             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2265             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2266             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2267
2268             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2269             rinv01           = gmx_mm_invsqrt_ps(rsq01);
2270             rinv02           = gmx_mm_invsqrt_ps(rsq02);
2271             rinv10           = gmx_mm_invsqrt_ps(rsq10);
2272             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2273             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2274             rinv20           = gmx_mm_invsqrt_ps(rsq20);
2275             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2276             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2277
2278             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2279             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2280             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2281             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2282             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2283             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2284             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2285             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2286             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2287
2288             fjx0             = _mm_setzero_ps();
2289             fjy0             = _mm_setzero_ps();
2290             fjz0             = _mm_setzero_ps();
2291             fjx1             = _mm_setzero_ps();
2292             fjy1             = _mm_setzero_ps();
2293             fjz1             = _mm_setzero_ps();
2294             fjx2             = _mm_setzero_ps();
2295             fjy2             = _mm_setzero_ps();
2296             fjz2             = _mm_setzero_ps();
2297
2298             /**************************
2299              * CALCULATE INTERACTIONS *
2300              **************************/
2301
2302             if (gmx_mm_any_lt(rsq00,rcutoff2))
2303             {
2304
2305             r00              = _mm_mul_ps(rsq00,rinv00);
2306             r00              = _mm_andnot_ps(dummy_mask,r00);
2307
2308             /* EWALD ELECTROSTATICS */
2309
2310             /* Analytical PME correction */
2311             zeta2            = _mm_mul_ps(beta2,rsq00);
2312             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
2313             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2314             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2315             felec            = _mm_mul_ps(qq00,felec);
2316             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2317             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
2318             velec            = _mm_mul_ps(qq00,velec);
2319
2320             /* LENNARD-JONES DISPERSION/REPULSION */
2321
2322             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2323             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2324             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2325             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
2326             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2327
2328             d                = _mm_sub_ps(r00,rswitch);
2329             d                = _mm_max_ps(d,_mm_setzero_ps());
2330             d2               = _mm_mul_ps(d,d);
2331             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2332
2333             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2334
2335             /* Evaluate switch function */
2336             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2337             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2338             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2339             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2340
2341             fscal            = _mm_add_ps(felec,fvdw);
2342
2343             fscal            = _mm_and_ps(fscal,cutoff_mask);
2344
2345             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2346
2347              /* Update vectorial force */
2348             fix0             = _mm_macc_ps(dx00,fscal,fix0);
2349             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
2350             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
2351
2352             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
2353             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
2354             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
2355
2356             }
2357
2358             /**************************
2359              * CALCULATE INTERACTIONS *
2360              **************************/
2361
2362             if (gmx_mm_any_lt(rsq01,rcutoff2))
2363             {
2364
2365             r01              = _mm_mul_ps(rsq01,rinv01);
2366             r01              = _mm_andnot_ps(dummy_mask,r01);
2367
2368             /* EWALD ELECTROSTATICS */
2369
2370             /* Analytical PME correction */
2371             zeta2            = _mm_mul_ps(beta2,rsq01);
2372             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
2373             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2374             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2375             felec            = _mm_mul_ps(qq01,felec);
2376             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2377             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
2378             velec            = _mm_mul_ps(qq01,velec);
2379
2380             d                = _mm_sub_ps(r01,rswitch);
2381             d                = _mm_max_ps(d,_mm_setzero_ps());
2382             d2               = _mm_mul_ps(d,d);
2383             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2384
2385             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2386
2387             /* Evaluate switch function */
2388             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2389             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2390             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2391
2392             fscal            = felec;
2393
2394             fscal            = _mm_and_ps(fscal,cutoff_mask);
2395
2396             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2397
2398              /* Update vectorial force */
2399             fix0             = _mm_macc_ps(dx01,fscal,fix0);
2400             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
2401             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
2402
2403             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
2404             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
2405             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
2406
2407             }
2408
2409             /**************************
2410              * CALCULATE INTERACTIONS *
2411              **************************/
2412
2413             if (gmx_mm_any_lt(rsq02,rcutoff2))
2414             {
2415
2416             r02              = _mm_mul_ps(rsq02,rinv02);
2417             r02              = _mm_andnot_ps(dummy_mask,r02);
2418
2419             /* EWALD ELECTROSTATICS */
2420
2421             /* Analytical PME correction */
2422             zeta2            = _mm_mul_ps(beta2,rsq02);
2423             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
2424             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2425             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2426             felec            = _mm_mul_ps(qq02,felec);
2427             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2428             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
2429             velec            = _mm_mul_ps(qq02,velec);
2430
2431             d                = _mm_sub_ps(r02,rswitch);
2432             d                = _mm_max_ps(d,_mm_setzero_ps());
2433             d2               = _mm_mul_ps(d,d);
2434             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2435
2436             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2437
2438             /* Evaluate switch function */
2439             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2440             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2441             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2442
2443             fscal            = felec;
2444
2445             fscal            = _mm_and_ps(fscal,cutoff_mask);
2446
2447             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2448
2449              /* Update vectorial force */
2450             fix0             = _mm_macc_ps(dx02,fscal,fix0);
2451             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
2452             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
2453
2454             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
2455             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
2456             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
2457
2458             }
2459
2460             /**************************
2461              * CALCULATE INTERACTIONS *
2462              **************************/
2463
2464             if (gmx_mm_any_lt(rsq10,rcutoff2))
2465             {
2466
2467             r10              = _mm_mul_ps(rsq10,rinv10);
2468             r10              = _mm_andnot_ps(dummy_mask,r10);
2469
2470             /* EWALD ELECTROSTATICS */
2471
2472             /* Analytical PME correction */
2473             zeta2            = _mm_mul_ps(beta2,rsq10);
2474             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
2475             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2476             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2477             felec            = _mm_mul_ps(qq10,felec);
2478             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2479             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
2480             velec            = _mm_mul_ps(qq10,velec);
2481
2482             d                = _mm_sub_ps(r10,rswitch);
2483             d                = _mm_max_ps(d,_mm_setzero_ps());
2484             d2               = _mm_mul_ps(d,d);
2485             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2486
2487             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2488
2489             /* Evaluate switch function */
2490             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2491             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2492             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2493
2494             fscal            = felec;
2495
2496             fscal            = _mm_and_ps(fscal,cutoff_mask);
2497
2498             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2499
2500              /* Update vectorial force */
2501             fix1             = _mm_macc_ps(dx10,fscal,fix1);
2502             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
2503             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
2504
2505             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
2506             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
2507             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
2508
2509             }
2510
2511             /**************************
2512              * CALCULATE INTERACTIONS *
2513              **************************/
2514
2515             if (gmx_mm_any_lt(rsq11,rcutoff2))
2516             {
2517
2518             r11              = _mm_mul_ps(rsq11,rinv11);
2519             r11              = _mm_andnot_ps(dummy_mask,r11);
2520
2521             /* EWALD ELECTROSTATICS */
2522
2523             /* Analytical PME correction */
2524             zeta2            = _mm_mul_ps(beta2,rsq11);
2525             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
2526             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2527             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2528             felec            = _mm_mul_ps(qq11,felec);
2529             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2530             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2531             velec            = _mm_mul_ps(qq11,velec);
2532
2533             d                = _mm_sub_ps(r11,rswitch);
2534             d                = _mm_max_ps(d,_mm_setzero_ps());
2535             d2               = _mm_mul_ps(d,d);
2536             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2537
2538             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2539
2540             /* Evaluate switch function */
2541             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2542             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2543             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2544
2545             fscal            = felec;
2546
2547             fscal            = _mm_and_ps(fscal,cutoff_mask);
2548
2549             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2550
2551              /* Update vectorial force */
2552             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2553             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2554             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2555
2556             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2557             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2558             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2559
2560             }
2561
2562             /**************************
2563              * CALCULATE INTERACTIONS *
2564              **************************/
2565
2566             if (gmx_mm_any_lt(rsq12,rcutoff2))
2567             {
2568
2569             r12              = _mm_mul_ps(rsq12,rinv12);
2570             r12              = _mm_andnot_ps(dummy_mask,r12);
2571
2572             /* EWALD ELECTROSTATICS */
2573
2574             /* Analytical PME correction */
2575             zeta2            = _mm_mul_ps(beta2,rsq12);
2576             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
2577             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2578             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2579             felec            = _mm_mul_ps(qq12,felec);
2580             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2581             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2582             velec            = _mm_mul_ps(qq12,velec);
2583
2584             d                = _mm_sub_ps(r12,rswitch);
2585             d                = _mm_max_ps(d,_mm_setzero_ps());
2586             d2               = _mm_mul_ps(d,d);
2587             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2588
2589             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2590
2591             /* Evaluate switch function */
2592             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2593             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2594             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2595
2596             fscal            = felec;
2597
2598             fscal            = _mm_and_ps(fscal,cutoff_mask);
2599
2600             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2601
2602              /* Update vectorial force */
2603             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2604             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2605             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2606
2607             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2608             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2609             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2610
2611             }
2612
2613             /**************************
2614              * CALCULATE INTERACTIONS *
2615              **************************/
2616
2617             if (gmx_mm_any_lt(rsq20,rcutoff2))
2618             {
2619
2620             r20              = _mm_mul_ps(rsq20,rinv20);
2621             r20              = _mm_andnot_ps(dummy_mask,r20);
2622
2623             /* EWALD ELECTROSTATICS */
2624
2625             /* Analytical PME correction */
2626             zeta2            = _mm_mul_ps(beta2,rsq20);
2627             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
2628             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2629             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2630             felec            = _mm_mul_ps(qq20,felec);
2631             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2632             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2633             velec            = _mm_mul_ps(qq20,velec);
2634
2635             d                = _mm_sub_ps(r20,rswitch);
2636             d                = _mm_max_ps(d,_mm_setzero_ps());
2637             d2               = _mm_mul_ps(d,d);
2638             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2639
2640             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2641
2642             /* Evaluate switch function */
2643             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2644             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2645             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2646
2647             fscal            = felec;
2648
2649             fscal            = _mm_and_ps(fscal,cutoff_mask);
2650
2651             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2652
2653              /* Update vectorial force */
2654             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2655             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2656             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2657
2658             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2659             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2660             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2661
2662             }
2663
2664             /**************************
2665              * CALCULATE INTERACTIONS *
2666              **************************/
2667
2668             if (gmx_mm_any_lt(rsq21,rcutoff2))
2669             {
2670
2671             r21              = _mm_mul_ps(rsq21,rinv21);
2672             r21              = _mm_andnot_ps(dummy_mask,r21);
2673
2674             /* EWALD ELECTROSTATICS */
2675
2676             /* Analytical PME correction */
2677             zeta2            = _mm_mul_ps(beta2,rsq21);
2678             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2679             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2680             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2681             felec            = _mm_mul_ps(qq21,felec);
2682             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2683             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2684             velec            = _mm_mul_ps(qq21,velec);
2685
2686             d                = _mm_sub_ps(r21,rswitch);
2687             d                = _mm_max_ps(d,_mm_setzero_ps());
2688             d2               = _mm_mul_ps(d,d);
2689             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2690
2691             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2692
2693             /* Evaluate switch function */
2694             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2695             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2696             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2697
2698             fscal            = felec;
2699
2700             fscal            = _mm_and_ps(fscal,cutoff_mask);
2701
2702             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2703
2704              /* Update vectorial force */
2705             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2706             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2707             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2708
2709             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2710             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2711             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2712
2713             }
2714
2715             /**************************
2716              * CALCULATE INTERACTIONS *
2717              **************************/
2718
2719             if (gmx_mm_any_lt(rsq22,rcutoff2))
2720             {
2721
2722             r22              = _mm_mul_ps(rsq22,rinv22);
2723             r22              = _mm_andnot_ps(dummy_mask,r22);
2724
2725             /* EWALD ELECTROSTATICS */
2726
2727             /* Analytical PME correction */
2728             zeta2            = _mm_mul_ps(beta2,rsq22);
2729             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2730             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2731             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2732             felec            = _mm_mul_ps(qq22,felec);
2733             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2734             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2735             velec            = _mm_mul_ps(qq22,velec);
2736
2737             d                = _mm_sub_ps(r22,rswitch);
2738             d                = _mm_max_ps(d,_mm_setzero_ps());
2739             d2               = _mm_mul_ps(d,d);
2740             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2741
2742             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2743
2744             /* Evaluate switch function */
2745             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2746             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2747             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2748
2749             fscal            = felec;
2750
2751             fscal            = _mm_and_ps(fscal,cutoff_mask);
2752
2753             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2754
2755              /* Update vectorial force */
2756             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2757             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2758             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2759
2760             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2761             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2762             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2763
2764             }
2765
2766             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2767             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2768             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2769             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2770
2771             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2772                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2773
2774             /* Inner loop uses 474 flops */
2775         }
2776
2777         /* End of innermost loop */
2778
2779         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2780                                               f+i_coord_offset,fshift+i_shift_offset);
2781
2782         /* Increment number of inner iterations */
2783         inneriter                  += j_index_end - j_index_start;
2784
2785         /* Outer loop uses 18 flops */
2786     }
2787
2788     /* Increment number of outer iterations */
2789     outeriter        += nri;
2790
2791     /* Update outer/inner flops */
2792
2793     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*474);
2794 }