made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_avx_128_fma_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_128_fma_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_128_fma_single.h"
34 #include "kernelutil_x86_avx_128_fma_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_128_fma_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     int              nvdwtype;
93     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
94     int              *vdwtype;
95     real             *vdwparam;
96     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
97     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
98     __m128i          ewitab;
99     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
101     real             *ewtab;
102     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
103     real             rswitch_scalar,d_scalar;
104     __m128           dummy_mask,cutoff_mask;
105     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
106     __m128           one     = _mm_set1_ps(1.0);
107     __m128           two     = _mm_set1_ps(2.0);
108     x                = xx[0];
109     f                = ff[0];
110
111     nri              = nlist->nri;
112     iinr             = nlist->iinr;
113     jindex           = nlist->jindex;
114     jjnr             = nlist->jjnr;
115     shiftidx         = nlist->shift;
116     gid              = nlist->gid;
117     shiftvec         = fr->shift_vec[0];
118     fshift           = fr->fshift[0];
119     facel            = _mm_set1_ps(fr->epsfac);
120     charge           = mdatoms->chargeA;
121     nvdwtype         = fr->ntype;
122     vdwparam         = fr->nbfp;
123     vdwtype          = mdatoms->typeA;
124
125     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
126     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
127     beta2            = _mm_mul_ps(beta,beta);
128     beta3            = _mm_mul_ps(beta,beta2);
129     ewtab            = fr->ic->tabq_coul_FDV0;
130     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
131     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
132
133     /* Setup water-specific parameters */
134     inr              = nlist->iinr[0];
135     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
136     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
139
140     jq0              = _mm_set1_ps(charge[inr+0]);
141     jq1              = _mm_set1_ps(charge[inr+1]);
142     jq2              = _mm_set1_ps(charge[inr+2]);
143     vdwjidx0A        = 2*vdwtype[inr+0];
144     qq00             = _mm_mul_ps(iq0,jq0);
145     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
146     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
147     qq01             = _mm_mul_ps(iq0,jq1);
148     qq02             = _mm_mul_ps(iq0,jq2);
149     qq10             = _mm_mul_ps(iq1,jq0);
150     qq11             = _mm_mul_ps(iq1,jq1);
151     qq12             = _mm_mul_ps(iq1,jq2);
152     qq20             = _mm_mul_ps(iq2,jq0);
153     qq21             = _mm_mul_ps(iq2,jq1);
154     qq22             = _mm_mul_ps(iq2,jq2);
155
156     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
157     rcutoff_scalar   = fr->rcoulomb;
158     rcutoff          = _mm_set1_ps(rcutoff_scalar);
159     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
160
161     rswitch_scalar   = fr->rcoulomb_switch;
162     rswitch          = _mm_set1_ps(rswitch_scalar);
163     /* Setup switch parameters */
164     d_scalar         = rcutoff_scalar-rswitch_scalar;
165     d                = _mm_set1_ps(d_scalar);
166     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
167     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
168     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
169     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
170     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
171     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
172
173     /* Avoid stupid compiler warnings */
174     jnrA = jnrB = jnrC = jnrD = 0;
175     j_coord_offsetA = 0;
176     j_coord_offsetB = 0;
177     j_coord_offsetC = 0;
178     j_coord_offsetD = 0;
179
180     outeriter        = 0;
181     inneriter        = 0;
182
183     for(iidx=0;iidx<4*DIM;iidx++)
184     {
185         scratch[iidx] = 0.0;
186     }
187
188     /* Start outer loop over neighborlists */
189     for(iidx=0; iidx<nri; iidx++)
190     {
191         /* Load shift vector for this list */
192         i_shift_offset   = DIM*shiftidx[iidx];
193
194         /* Load limits for loop over neighbors */
195         j_index_start    = jindex[iidx];
196         j_index_end      = jindex[iidx+1];
197
198         /* Get outer coordinate index */
199         inr              = iinr[iidx];
200         i_coord_offset   = DIM*inr;
201
202         /* Load i particle coords and add shift vector */
203         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
204                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
205
206         fix0             = _mm_setzero_ps();
207         fiy0             = _mm_setzero_ps();
208         fiz0             = _mm_setzero_ps();
209         fix1             = _mm_setzero_ps();
210         fiy1             = _mm_setzero_ps();
211         fiz1             = _mm_setzero_ps();
212         fix2             = _mm_setzero_ps();
213         fiy2             = _mm_setzero_ps();
214         fiz2             = _mm_setzero_ps();
215
216         /* Reset potential sums */
217         velecsum         = _mm_setzero_ps();
218         vvdwsum          = _mm_setzero_ps();
219
220         /* Start inner kernel loop */
221         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
222         {
223
224             /* Get j neighbor index, and coordinate index */
225             jnrA             = jjnr[jidx];
226             jnrB             = jjnr[jidx+1];
227             jnrC             = jjnr[jidx+2];
228             jnrD             = jjnr[jidx+3];
229             j_coord_offsetA  = DIM*jnrA;
230             j_coord_offsetB  = DIM*jnrB;
231             j_coord_offsetC  = DIM*jnrC;
232             j_coord_offsetD  = DIM*jnrD;
233
234             /* load j atom coordinates */
235             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236                                               x+j_coord_offsetC,x+j_coord_offsetD,
237                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
238
239             /* Calculate displacement vector */
240             dx00             = _mm_sub_ps(ix0,jx0);
241             dy00             = _mm_sub_ps(iy0,jy0);
242             dz00             = _mm_sub_ps(iz0,jz0);
243             dx01             = _mm_sub_ps(ix0,jx1);
244             dy01             = _mm_sub_ps(iy0,jy1);
245             dz01             = _mm_sub_ps(iz0,jz1);
246             dx02             = _mm_sub_ps(ix0,jx2);
247             dy02             = _mm_sub_ps(iy0,jy2);
248             dz02             = _mm_sub_ps(iz0,jz2);
249             dx10             = _mm_sub_ps(ix1,jx0);
250             dy10             = _mm_sub_ps(iy1,jy0);
251             dz10             = _mm_sub_ps(iz1,jz0);
252             dx11             = _mm_sub_ps(ix1,jx1);
253             dy11             = _mm_sub_ps(iy1,jy1);
254             dz11             = _mm_sub_ps(iz1,jz1);
255             dx12             = _mm_sub_ps(ix1,jx2);
256             dy12             = _mm_sub_ps(iy1,jy2);
257             dz12             = _mm_sub_ps(iz1,jz2);
258             dx20             = _mm_sub_ps(ix2,jx0);
259             dy20             = _mm_sub_ps(iy2,jy0);
260             dz20             = _mm_sub_ps(iz2,jz0);
261             dx21             = _mm_sub_ps(ix2,jx1);
262             dy21             = _mm_sub_ps(iy2,jy1);
263             dz21             = _mm_sub_ps(iz2,jz1);
264             dx22             = _mm_sub_ps(ix2,jx2);
265             dy22             = _mm_sub_ps(iy2,jy2);
266             dz22             = _mm_sub_ps(iz2,jz2);
267
268             /* Calculate squared distance and things based on it */
269             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
270             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
271             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
272             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
273             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
274             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
275             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
276             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
277             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
278
279             rinv00           = gmx_mm_invsqrt_ps(rsq00);
280             rinv01           = gmx_mm_invsqrt_ps(rsq01);
281             rinv02           = gmx_mm_invsqrt_ps(rsq02);
282             rinv10           = gmx_mm_invsqrt_ps(rsq10);
283             rinv11           = gmx_mm_invsqrt_ps(rsq11);
284             rinv12           = gmx_mm_invsqrt_ps(rsq12);
285             rinv20           = gmx_mm_invsqrt_ps(rsq20);
286             rinv21           = gmx_mm_invsqrt_ps(rsq21);
287             rinv22           = gmx_mm_invsqrt_ps(rsq22);
288
289             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
290             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
291             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
292             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
293             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
294             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
295             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
296             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
297             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
298
299             fjx0             = _mm_setzero_ps();
300             fjy0             = _mm_setzero_ps();
301             fjz0             = _mm_setzero_ps();
302             fjx1             = _mm_setzero_ps();
303             fjy1             = _mm_setzero_ps();
304             fjz1             = _mm_setzero_ps();
305             fjx2             = _mm_setzero_ps();
306             fjy2             = _mm_setzero_ps();
307             fjz2             = _mm_setzero_ps();
308
309             /**************************
310              * CALCULATE INTERACTIONS *
311              **************************/
312
313             if (gmx_mm_any_lt(rsq00,rcutoff2))
314             {
315
316             r00              = _mm_mul_ps(rsq00,rinv00);
317
318             /* EWALD ELECTROSTATICS */
319
320             /* Analytical PME correction */
321             zeta2            = _mm_mul_ps(beta2,rsq00);
322             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
323             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
324             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
325             felec            = _mm_mul_ps(qq00,felec);
326             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
327             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
328             velec            = _mm_mul_ps(qq00,velec);
329
330             /* LENNARD-JONES DISPERSION/REPULSION */
331
332             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
333             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
334             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
335             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
336             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
337
338             d                = _mm_sub_ps(r00,rswitch);
339             d                = _mm_max_ps(d,_mm_setzero_ps());
340             d2               = _mm_mul_ps(d,d);
341             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
342
343             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
344
345             /* Evaluate switch function */
346             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
347             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
348             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
349             velec            = _mm_mul_ps(velec,sw);
350             vvdw             = _mm_mul_ps(vvdw,sw);
351             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
352
353             /* Update potential sum for this i atom from the interaction with this j atom. */
354             velec            = _mm_and_ps(velec,cutoff_mask);
355             velecsum         = _mm_add_ps(velecsum,velec);
356             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
357             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
358
359             fscal            = _mm_add_ps(felec,fvdw);
360
361             fscal            = _mm_and_ps(fscal,cutoff_mask);
362
363              /* Update vectorial force */
364             fix0             = _mm_macc_ps(dx00,fscal,fix0);
365             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
366             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
367
368             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
369             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
370             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
371
372             }
373
374             /**************************
375              * CALCULATE INTERACTIONS *
376              **************************/
377
378             if (gmx_mm_any_lt(rsq01,rcutoff2))
379             {
380
381             r01              = _mm_mul_ps(rsq01,rinv01);
382
383             /* EWALD ELECTROSTATICS */
384
385             /* Analytical PME correction */
386             zeta2            = _mm_mul_ps(beta2,rsq01);
387             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
388             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
389             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
390             felec            = _mm_mul_ps(qq01,felec);
391             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
392             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
393             velec            = _mm_mul_ps(qq01,velec);
394
395             d                = _mm_sub_ps(r01,rswitch);
396             d                = _mm_max_ps(d,_mm_setzero_ps());
397             d2               = _mm_mul_ps(d,d);
398             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
399
400             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
401
402             /* Evaluate switch function */
403             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
404             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
405             velec            = _mm_mul_ps(velec,sw);
406             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
407
408             /* Update potential sum for this i atom from the interaction with this j atom. */
409             velec            = _mm_and_ps(velec,cutoff_mask);
410             velecsum         = _mm_add_ps(velecsum,velec);
411
412             fscal            = felec;
413
414             fscal            = _mm_and_ps(fscal,cutoff_mask);
415
416              /* Update vectorial force */
417             fix0             = _mm_macc_ps(dx01,fscal,fix0);
418             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
419             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
420
421             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
422             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
423             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
424
425             }
426
427             /**************************
428              * CALCULATE INTERACTIONS *
429              **************************/
430
431             if (gmx_mm_any_lt(rsq02,rcutoff2))
432             {
433
434             r02              = _mm_mul_ps(rsq02,rinv02);
435
436             /* EWALD ELECTROSTATICS */
437
438             /* Analytical PME correction */
439             zeta2            = _mm_mul_ps(beta2,rsq02);
440             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
441             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
442             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
443             felec            = _mm_mul_ps(qq02,felec);
444             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
445             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
446             velec            = _mm_mul_ps(qq02,velec);
447
448             d                = _mm_sub_ps(r02,rswitch);
449             d                = _mm_max_ps(d,_mm_setzero_ps());
450             d2               = _mm_mul_ps(d,d);
451             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
452
453             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
454
455             /* Evaluate switch function */
456             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
457             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
458             velec            = _mm_mul_ps(velec,sw);
459             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
460
461             /* Update potential sum for this i atom from the interaction with this j atom. */
462             velec            = _mm_and_ps(velec,cutoff_mask);
463             velecsum         = _mm_add_ps(velecsum,velec);
464
465             fscal            = felec;
466
467             fscal            = _mm_and_ps(fscal,cutoff_mask);
468
469              /* Update vectorial force */
470             fix0             = _mm_macc_ps(dx02,fscal,fix0);
471             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
472             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
473
474             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
475             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
476             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
477
478             }
479
480             /**************************
481              * CALCULATE INTERACTIONS *
482              **************************/
483
484             if (gmx_mm_any_lt(rsq10,rcutoff2))
485             {
486
487             r10              = _mm_mul_ps(rsq10,rinv10);
488
489             /* EWALD ELECTROSTATICS */
490
491             /* Analytical PME correction */
492             zeta2            = _mm_mul_ps(beta2,rsq10);
493             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
494             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
495             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
496             felec            = _mm_mul_ps(qq10,felec);
497             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
498             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
499             velec            = _mm_mul_ps(qq10,velec);
500
501             d                = _mm_sub_ps(r10,rswitch);
502             d                = _mm_max_ps(d,_mm_setzero_ps());
503             d2               = _mm_mul_ps(d,d);
504             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
505
506             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
507
508             /* Evaluate switch function */
509             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
510             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
511             velec            = _mm_mul_ps(velec,sw);
512             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
513
514             /* Update potential sum for this i atom from the interaction with this j atom. */
515             velec            = _mm_and_ps(velec,cutoff_mask);
516             velecsum         = _mm_add_ps(velecsum,velec);
517
518             fscal            = felec;
519
520             fscal            = _mm_and_ps(fscal,cutoff_mask);
521
522              /* Update vectorial force */
523             fix1             = _mm_macc_ps(dx10,fscal,fix1);
524             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
525             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
526
527             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
528             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
529             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
530
531             }
532
533             /**************************
534              * CALCULATE INTERACTIONS *
535              **************************/
536
537             if (gmx_mm_any_lt(rsq11,rcutoff2))
538             {
539
540             r11              = _mm_mul_ps(rsq11,rinv11);
541
542             /* EWALD ELECTROSTATICS */
543
544             /* Analytical PME correction */
545             zeta2            = _mm_mul_ps(beta2,rsq11);
546             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
547             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
548             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
549             felec            = _mm_mul_ps(qq11,felec);
550             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
551             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
552             velec            = _mm_mul_ps(qq11,velec);
553
554             d                = _mm_sub_ps(r11,rswitch);
555             d                = _mm_max_ps(d,_mm_setzero_ps());
556             d2               = _mm_mul_ps(d,d);
557             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
558
559             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
560
561             /* Evaluate switch function */
562             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
563             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
564             velec            = _mm_mul_ps(velec,sw);
565             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
566
567             /* Update potential sum for this i atom from the interaction with this j atom. */
568             velec            = _mm_and_ps(velec,cutoff_mask);
569             velecsum         = _mm_add_ps(velecsum,velec);
570
571             fscal            = felec;
572
573             fscal            = _mm_and_ps(fscal,cutoff_mask);
574
575              /* Update vectorial force */
576             fix1             = _mm_macc_ps(dx11,fscal,fix1);
577             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
578             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
579
580             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
581             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
582             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
583
584             }
585
586             /**************************
587              * CALCULATE INTERACTIONS *
588              **************************/
589
590             if (gmx_mm_any_lt(rsq12,rcutoff2))
591             {
592
593             r12              = _mm_mul_ps(rsq12,rinv12);
594
595             /* EWALD ELECTROSTATICS */
596
597             /* Analytical PME correction */
598             zeta2            = _mm_mul_ps(beta2,rsq12);
599             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
600             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
601             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
602             felec            = _mm_mul_ps(qq12,felec);
603             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
604             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
605             velec            = _mm_mul_ps(qq12,velec);
606
607             d                = _mm_sub_ps(r12,rswitch);
608             d                = _mm_max_ps(d,_mm_setzero_ps());
609             d2               = _mm_mul_ps(d,d);
610             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
611
612             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
613
614             /* Evaluate switch function */
615             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
616             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
617             velec            = _mm_mul_ps(velec,sw);
618             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
619
620             /* Update potential sum for this i atom from the interaction with this j atom. */
621             velec            = _mm_and_ps(velec,cutoff_mask);
622             velecsum         = _mm_add_ps(velecsum,velec);
623
624             fscal            = felec;
625
626             fscal            = _mm_and_ps(fscal,cutoff_mask);
627
628              /* Update vectorial force */
629             fix1             = _mm_macc_ps(dx12,fscal,fix1);
630             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
631             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
632
633             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
634             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
635             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
636
637             }
638
639             /**************************
640              * CALCULATE INTERACTIONS *
641              **************************/
642
643             if (gmx_mm_any_lt(rsq20,rcutoff2))
644             {
645
646             r20              = _mm_mul_ps(rsq20,rinv20);
647
648             /* EWALD ELECTROSTATICS */
649
650             /* Analytical PME correction */
651             zeta2            = _mm_mul_ps(beta2,rsq20);
652             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
653             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
654             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
655             felec            = _mm_mul_ps(qq20,felec);
656             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
657             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
658             velec            = _mm_mul_ps(qq20,velec);
659
660             d                = _mm_sub_ps(r20,rswitch);
661             d                = _mm_max_ps(d,_mm_setzero_ps());
662             d2               = _mm_mul_ps(d,d);
663             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
664
665             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
666
667             /* Evaluate switch function */
668             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
669             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
670             velec            = _mm_mul_ps(velec,sw);
671             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
672
673             /* Update potential sum for this i atom from the interaction with this j atom. */
674             velec            = _mm_and_ps(velec,cutoff_mask);
675             velecsum         = _mm_add_ps(velecsum,velec);
676
677             fscal            = felec;
678
679             fscal            = _mm_and_ps(fscal,cutoff_mask);
680
681              /* Update vectorial force */
682             fix2             = _mm_macc_ps(dx20,fscal,fix2);
683             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
684             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
685
686             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
687             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
688             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
689
690             }
691
692             /**************************
693              * CALCULATE INTERACTIONS *
694              **************************/
695
696             if (gmx_mm_any_lt(rsq21,rcutoff2))
697             {
698
699             r21              = _mm_mul_ps(rsq21,rinv21);
700
701             /* EWALD ELECTROSTATICS */
702
703             /* Analytical PME correction */
704             zeta2            = _mm_mul_ps(beta2,rsq21);
705             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
706             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
707             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
708             felec            = _mm_mul_ps(qq21,felec);
709             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
710             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
711             velec            = _mm_mul_ps(qq21,velec);
712
713             d                = _mm_sub_ps(r21,rswitch);
714             d                = _mm_max_ps(d,_mm_setzero_ps());
715             d2               = _mm_mul_ps(d,d);
716             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
717
718             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
719
720             /* Evaluate switch function */
721             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
722             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
723             velec            = _mm_mul_ps(velec,sw);
724             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
725
726             /* Update potential sum for this i atom from the interaction with this j atom. */
727             velec            = _mm_and_ps(velec,cutoff_mask);
728             velecsum         = _mm_add_ps(velecsum,velec);
729
730             fscal            = felec;
731
732             fscal            = _mm_and_ps(fscal,cutoff_mask);
733
734              /* Update vectorial force */
735             fix2             = _mm_macc_ps(dx21,fscal,fix2);
736             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
737             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
738
739             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
740             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
741             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
742
743             }
744
745             /**************************
746              * CALCULATE INTERACTIONS *
747              **************************/
748
749             if (gmx_mm_any_lt(rsq22,rcutoff2))
750             {
751
752             r22              = _mm_mul_ps(rsq22,rinv22);
753
754             /* EWALD ELECTROSTATICS */
755
756             /* Analytical PME correction */
757             zeta2            = _mm_mul_ps(beta2,rsq22);
758             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
759             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
760             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
761             felec            = _mm_mul_ps(qq22,felec);
762             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
763             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
764             velec            = _mm_mul_ps(qq22,velec);
765
766             d                = _mm_sub_ps(r22,rswitch);
767             d                = _mm_max_ps(d,_mm_setzero_ps());
768             d2               = _mm_mul_ps(d,d);
769             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
770
771             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
772
773             /* Evaluate switch function */
774             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
775             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
776             velec            = _mm_mul_ps(velec,sw);
777             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
778
779             /* Update potential sum for this i atom from the interaction with this j atom. */
780             velec            = _mm_and_ps(velec,cutoff_mask);
781             velecsum         = _mm_add_ps(velecsum,velec);
782
783             fscal            = felec;
784
785             fscal            = _mm_and_ps(fscal,cutoff_mask);
786
787              /* Update vectorial force */
788             fix2             = _mm_macc_ps(dx22,fscal,fix2);
789             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
790             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
791
792             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
793             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
794             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
795
796             }
797
798             fjptrA             = f+j_coord_offsetA;
799             fjptrB             = f+j_coord_offsetB;
800             fjptrC             = f+j_coord_offsetC;
801             fjptrD             = f+j_coord_offsetD;
802
803             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
804                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
805
806             /* Inner loop uses 495 flops */
807         }
808
809         if(jidx<j_index_end)
810         {
811
812             /* Get j neighbor index, and coordinate index */
813             jnrlistA         = jjnr[jidx];
814             jnrlistB         = jjnr[jidx+1];
815             jnrlistC         = jjnr[jidx+2];
816             jnrlistD         = jjnr[jidx+3];
817             /* Sign of each element will be negative for non-real atoms.
818              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
819              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
820              */
821             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
822             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
823             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
824             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
825             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
826             j_coord_offsetA  = DIM*jnrA;
827             j_coord_offsetB  = DIM*jnrB;
828             j_coord_offsetC  = DIM*jnrC;
829             j_coord_offsetD  = DIM*jnrD;
830
831             /* load j atom coordinates */
832             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
833                                               x+j_coord_offsetC,x+j_coord_offsetD,
834                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
835
836             /* Calculate displacement vector */
837             dx00             = _mm_sub_ps(ix0,jx0);
838             dy00             = _mm_sub_ps(iy0,jy0);
839             dz00             = _mm_sub_ps(iz0,jz0);
840             dx01             = _mm_sub_ps(ix0,jx1);
841             dy01             = _mm_sub_ps(iy0,jy1);
842             dz01             = _mm_sub_ps(iz0,jz1);
843             dx02             = _mm_sub_ps(ix0,jx2);
844             dy02             = _mm_sub_ps(iy0,jy2);
845             dz02             = _mm_sub_ps(iz0,jz2);
846             dx10             = _mm_sub_ps(ix1,jx0);
847             dy10             = _mm_sub_ps(iy1,jy0);
848             dz10             = _mm_sub_ps(iz1,jz0);
849             dx11             = _mm_sub_ps(ix1,jx1);
850             dy11             = _mm_sub_ps(iy1,jy1);
851             dz11             = _mm_sub_ps(iz1,jz1);
852             dx12             = _mm_sub_ps(ix1,jx2);
853             dy12             = _mm_sub_ps(iy1,jy2);
854             dz12             = _mm_sub_ps(iz1,jz2);
855             dx20             = _mm_sub_ps(ix2,jx0);
856             dy20             = _mm_sub_ps(iy2,jy0);
857             dz20             = _mm_sub_ps(iz2,jz0);
858             dx21             = _mm_sub_ps(ix2,jx1);
859             dy21             = _mm_sub_ps(iy2,jy1);
860             dz21             = _mm_sub_ps(iz2,jz1);
861             dx22             = _mm_sub_ps(ix2,jx2);
862             dy22             = _mm_sub_ps(iy2,jy2);
863             dz22             = _mm_sub_ps(iz2,jz2);
864
865             /* Calculate squared distance and things based on it */
866             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
867             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
868             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
869             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
870             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
871             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
872             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
873             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
874             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
875
876             rinv00           = gmx_mm_invsqrt_ps(rsq00);
877             rinv01           = gmx_mm_invsqrt_ps(rsq01);
878             rinv02           = gmx_mm_invsqrt_ps(rsq02);
879             rinv10           = gmx_mm_invsqrt_ps(rsq10);
880             rinv11           = gmx_mm_invsqrt_ps(rsq11);
881             rinv12           = gmx_mm_invsqrt_ps(rsq12);
882             rinv20           = gmx_mm_invsqrt_ps(rsq20);
883             rinv21           = gmx_mm_invsqrt_ps(rsq21);
884             rinv22           = gmx_mm_invsqrt_ps(rsq22);
885
886             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
887             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
888             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
889             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
890             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
891             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
892             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
893             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
894             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
895
896             fjx0             = _mm_setzero_ps();
897             fjy0             = _mm_setzero_ps();
898             fjz0             = _mm_setzero_ps();
899             fjx1             = _mm_setzero_ps();
900             fjy1             = _mm_setzero_ps();
901             fjz1             = _mm_setzero_ps();
902             fjx2             = _mm_setzero_ps();
903             fjy2             = _mm_setzero_ps();
904             fjz2             = _mm_setzero_ps();
905
906             /**************************
907              * CALCULATE INTERACTIONS *
908              **************************/
909
910             if (gmx_mm_any_lt(rsq00,rcutoff2))
911             {
912
913             r00              = _mm_mul_ps(rsq00,rinv00);
914             r00              = _mm_andnot_ps(dummy_mask,r00);
915
916             /* EWALD ELECTROSTATICS */
917
918             /* Analytical PME correction */
919             zeta2            = _mm_mul_ps(beta2,rsq00);
920             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
921             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
922             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
923             felec            = _mm_mul_ps(qq00,felec);
924             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
925             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
926             velec            = _mm_mul_ps(qq00,velec);
927
928             /* LENNARD-JONES DISPERSION/REPULSION */
929
930             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
931             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
932             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
933             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
934             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
935
936             d                = _mm_sub_ps(r00,rswitch);
937             d                = _mm_max_ps(d,_mm_setzero_ps());
938             d2               = _mm_mul_ps(d,d);
939             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
940
941             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
942
943             /* Evaluate switch function */
944             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
945             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
946             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
947             velec            = _mm_mul_ps(velec,sw);
948             vvdw             = _mm_mul_ps(vvdw,sw);
949             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
950
951             /* Update potential sum for this i atom from the interaction with this j atom. */
952             velec            = _mm_and_ps(velec,cutoff_mask);
953             velec            = _mm_andnot_ps(dummy_mask,velec);
954             velecsum         = _mm_add_ps(velecsum,velec);
955             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
956             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
957             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
958
959             fscal            = _mm_add_ps(felec,fvdw);
960
961             fscal            = _mm_and_ps(fscal,cutoff_mask);
962
963             fscal            = _mm_andnot_ps(dummy_mask,fscal);
964
965              /* Update vectorial force */
966             fix0             = _mm_macc_ps(dx00,fscal,fix0);
967             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
968             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
969
970             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
971             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
972             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
973
974             }
975
976             /**************************
977              * CALCULATE INTERACTIONS *
978              **************************/
979
980             if (gmx_mm_any_lt(rsq01,rcutoff2))
981             {
982
983             r01              = _mm_mul_ps(rsq01,rinv01);
984             r01              = _mm_andnot_ps(dummy_mask,r01);
985
986             /* EWALD ELECTROSTATICS */
987
988             /* Analytical PME correction */
989             zeta2            = _mm_mul_ps(beta2,rsq01);
990             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
991             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
992             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
993             felec            = _mm_mul_ps(qq01,felec);
994             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
995             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
996             velec            = _mm_mul_ps(qq01,velec);
997
998             d                = _mm_sub_ps(r01,rswitch);
999             d                = _mm_max_ps(d,_mm_setzero_ps());
1000             d2               = _mm_mul_ps(d,d);
1001             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1002
1003             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1004
1005             /* Evaluate switch function */
1006             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1007             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1008             velec            = _mm_mul_ps(velec,sw);
1009             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1010
1011             /* Update potential sum for this i atom from the interaction with this j atom. */
1012             velec            = _mm_and_ps(velec,cutoff_mask);
1013             velec            = _mm_andnot_ps(dummy_mask,velec);
1014             velecsum         = _mm_add_ps(velecsum,velec);
1015
1016             fscal            = felec;
1017
1018             fscal            = _mm_and_ps(fscal,cutoff_mask);
1019
1020             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1021
1022              /* Update vectorial force */
1023             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1024             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1025             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1026
1027             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1028             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1029             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1030
1031             }
1032
1033             /**************************
1034              * CALCULATE INTERACTIONS *
1035              **************************/
1036
1037             if (gmx_mm_any_lt(rsq02,rcutoff2))
1038             {
1039
1040             r02              = _mm_mul_ps(rsq02,rinv02);
1041             r02              = _mm_andnot_ps(dummy_mask,r02);
1042
1043             /* EWALD ELECTROSTATICS */
1044
1045             /* Analytical PME correction */
1046             zeta2            = _mm_mul_ps(beta2,rsq02);
1047             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1048             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1049             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1050             felec            = _mm_mul_ps(qq02,felec);
1051             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1052             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1053             velec            = _mm_mul_ps(qq02,velec);
1054
1055             d                = _mm_sub_ps(r02,rswitch);
1056             d                = _mm_max_ps(d,_mm_setzero_ps());
1057             d2               = _mm_mul_ps(d,d);
1058             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1059
1060             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1061
1062             /* Evaluate switch function */
1063             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1064             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1065             velec            = _mm_mul_ps(velec,sw);
1066             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1067
1068             /* Update potential sum for this i atom from the interaction with this j atom. */
1069             velec            = _mm_and_ps(velec,cutoff_mask);
1070             velec            = _mm_andnot_ps(dummy_mask,velec);
1071             velecsum         = _mm_add_ps(velecsum,velec);
1072
1073             fscal            = felec;
1074
1075             fscal            = _mm_and_ps(fscal,cutoff_mask);
1076
1077             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1078
1079              /* Update vectorial force */
1080             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1081             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1082             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1083
1084             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1085             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1086             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1087
1088             }
1089
1090             /**************************
1091              * CALCULATE INTERACTIONS *
1092              **************************/
1093
1094             if (gmx_mm_any_lt(rsq10,rcutoff2))
1095             {
1096
1097             r10              = _mm_mul_ps(rsq10,rinv10);
1098             r10              = _mm_andnot_ps(dummy_mask,r10);
1099
1100             /* EWALD ELECTROSTATICS */
1101
1102             /* Analytical PME correction */
1103             zeta2            = _mm_mul_ps(beta2,rsq10);
1104             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1105             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1106             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1107             felec            = _mm_mul_ps(qq10,felec);
1108             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1109             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1110             velec            = _mm_mul_ps(qq10,velec);
1111
1112             d                = _mm_sub_ps(r10,rswitch);
1113             d                = _mm_max_ps(d,_mm_setzero_ps());
1114             d2               = _mm_mul_ps(d,d);
1115             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1116
1117             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1118
1119             /* Evaluate switch function */
1120             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1121             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1122             velec            = _mm_mul_ps(velec,sw);
1123             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1124
1125             /* Update potential sum for this i atom from the interaction with this j atom. */
1126             velec            = _mm_and_ps(velec,cutoff_mask);
1127             velec            = _mm_andnot_ps(dummy_mask,velec);
1128             velecsum         = _mm_add_ps(velecsum,velec);
1129
1130             fscal            = felec;
1131
1132             fscal            = _mm_and_ps(fscal,cutoff_mask);
1133
1134             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1135
1136              /* Update vectorial force */
1137             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1138             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1139             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1140
1141             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1142             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1143             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1144
1145             }
1146
1147             /**************************
1148              * CALCULATE INTERACTIONS *
1149              **************************/
1150
1151             if (gmx_mm_any_lt(rsq11,rcutoff2))
1152             {
1153
1154             r11              = _mm_mul_ps(rsq11,rinv11);
1155             r11              = _mm_andnot_ps(dummy_mask,r11);
1156
1157             /* EWALD ELECTROSTATICS */
1158
1159             /* Analytical PME correction */
1160             zeta2            = _mm_mul_ps(beta2,rsq11);
1161             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1162             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1163             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1164             felec            = _mm_mul_ps(qq11,felec);
1165             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1166             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1167             velec            = _mm_mul_ps(qq11,velec);
1168
1169             d                = _mm_sub_ps(r11,rswitch);
1170             d                = _mm_max_ps(d,_mm_setzero_ps());
1171             d2               = _mm_mul_ps(d,d);
1172             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1173
1174             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1175
1176             /* Evaluate switch function */
1177             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1178             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1179             velec            = _mm_mul_ps(velec,sw);
1180             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1181
1182             /* Update potential sum for this i atom from the interaction with this j atom. */
1183             velec            = _mm_and_ps(velec,cutoff_mask);
1184             velec            = _mm_andnot_ps(dummy_mask,velec);
1185             velecsum         = _mm_add_ps(velecsum,velec);
1186
1187             fscal            = felec;
1188
1189             fscal            = _mm_and_ps(fscal,cutoff_mask);
1190
1191             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1192
1193              /* Update vectorial force */
1194             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1195             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1196             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1197
1198             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1199             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1200             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1201
1202             }
1203
1204             /**************************
1205              * CALCULATE INTERACTIONS *
1206              **************************/
1207
1208             if (gmx_mm_any_lt(rsq12,rcutoff2))
1209             {
1210
1211             r12              = _mm_mul_ps(rsq12,rinv12);
1212             r12              = _mm_andnot_ps(dummy_mask,r12);
1213
1214             /* EWALD ELECTROSTATICS */
1215
1216             /* Analytical PME correction */
1217             zeta2            = _mm_mul_ps(beta2,rsq12);
1218             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1219             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1220             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1221             felec            = _mm_mul_ps(qq12,felec);
1222             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1223             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1224             velec            = _mm_mul_ps(qq12,velec);
1225
1226             d                = _mm_sub_ps(r12,rswitch);
1227             d                = _mm_max_ps(d,_mm_setzero_ps());
1228             d2               = _mm_mul_ps(d,d);
1229             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1230
1231             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1232
1233             /* Evaluate switch function */
1234             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1235             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1236             velec            = _mm_mul_ps(velec,sw);
1237             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1238
1239             /* Update potential sum for this i atom from the interaction with this j atom. */
1240             velec            = _mm_and_ps(velec,cutoff_mask);
1241             velec            = _mm_andnot_ps(dummy_mask,velec);
1242             velecsum         = _mm_add_ps(velecsum,velec);
1243
1244             fscal            = felec;
1245
1246             fscal            = _mm_and_ps(fscal,cutoff_mask);
1247
1248             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1249
1250              /* Update vectorial force */
1251             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1252             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1253             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1254
1255             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1256             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1257             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1258
1259             }
1260
1261             /**************************
1262              * CALCULATE INTERACTIONS *
1263              **************************/
1264
1265             if (gmx_mm_any_lt(rsq20,rcutoff2))
1266             {
1267
1268             r20              = _mm_mul_ps(rsq20,rinv20);
1269             r20              = _mm_andnot_ps(dummy_mask,r20);
1270
1271             /* EWALD ELECTROSTATICS */
1272
1273             /* Analytical PME correction */
1274             zeta2            = _mm_mul_ps(beta2,rsq20);
1275             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1276             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1277             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1278             felec            = _mm_mul_ps(qq20,felec);
1279             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1280             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1281             velec            = _mm_mul_ps(qq20,velec);
1282
1283             d                = _mm_sub_ps(r20,rswitch);
1284             d                = _mm_max_ps(d,_mm_setzero_ps());
1285             d2               = _mm_mul_ps(d,d);
1286             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1287
1288             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1289
1290             /* Evaluate switch function */
1291             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1292             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1293             velec            = _mm_mul_ps(velec,sw);
1294             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1295
1296             /* Update potential sum for this i atom from the interaction with this j atom. */
1297             velec            = _mm_and_ps(velec,cutoff_mask);
1298             velec            = _mm_andnot_ps(dummy_mask,velec);
1299             velecsum         = _mm_add_ps(velecsum,velec);
1300
1301             fscal            = felec;
1302
1303             fscal            = _mm_and_ps(fscal,cutoff_mask);
1304
1305             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1306
1307              /* Update vectorial force */
1308             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1309             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1310             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1311
1312             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1313             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1314             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1315
1316             }
1317
1318             /**************************
1319              * CALCULATE INTERACTIONS *
1320              **************************/
1321
1322             if (gmx_mm_any_lt(rsq21,rcutoff2))
1323             {
1324
1325             r21              = _mm_mul_ps(rsq21,rinv21);
1326             r21              = _mm_andnot_ps(dummy_mask,r21);
1327
1328             /* EWALD ELECTROSTATICS */
1329
1330             /* Analytical PME correction */
1331             zeta2            = _mm_mul_ps(beta2,rsq21);
1332             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1333             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1334             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1335             felec            = _mm_mul_ps(qq21,felec);
1336             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1337             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1338             velec            = _mm_mul_ps(qq21,velec);
1339
1340             d                = _mm_sub_ps(r21,rswitch);
1341             d                = _mm_max_ps(d,_mm_setzero_ps());
1342             d2               = _mm_mul_ps(d,d);
1343             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1344
1345             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1346
1347             /* Evaluate switch function */
1348             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1349             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1350             velec            = _mm_mul_ps(velec,sw);
1351             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1352
1353             /* Update potential sum for this i atom from the interaction with this j atom. */
1354             velec            = _mm_and_ps(velec,cutoff_mask);
1355             velec            = _mm_andnot_ps(dummy_mask,velec);
1356             velecsum         = _mm_add_ps(velecsum,velec);
1357
1358             fscal            = felec;
1359
1360             fscal            = _mm_and_ps(fscal,cutoff_mask);
1361
1362             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1363
1364              /* Update vectorial force */
1365             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1366             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1367             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1368
1369             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1370             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1371             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1372
1373             }
1374
1375             /**************************
1376              * CALCULATE INTERACTIONS *
1377              **************************/
1378
1379             if (gmx_mm_any_lt(rsq22,rcutoff2))
1380             {
1381
1382             r22              = _mm_mul_ps(rsq22,rinv22);
1383             r22              = _mm_andnot_ps(dummy_mask,r22);
1384
1385             /* EWALD ELECTROSTATICS */
1386
1387             /* Analytical PME correction */
1388             zeta2            = _mm_mul_ps(beta2,rsq22);
1389             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1390             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1391             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1392             felec            = _mm_mul_ps(qq22,felec);
1393             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1394             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1395             velec            = _mm_mul_ps(qq22,velec);
1396
1397             d                = _mm_sub_ps(r22,rswitch);
1398             d                = _mm_max_ps(d,_mm_setzero_ps());
1399             d2               = _mm_mul_ps(d,d);
1400             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1401
1402             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1403
1404             /* Evaluate switch function */
1405             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1406             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1407             velec            = _mm_mul_ps(velec,sw);
1408             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1409
1410             /* Update potential sum for this i atom from the interaction with this j atom. */
1411             velec            = _mm_and_ps(velec,cutoff_mask);
1412             velec            = _mm_andnot_ps(dummy_mask,velec);
1413             velecsum         = _mm_add_ps(velecsum,velec);
1414
1415             fscal            = felec;
1416
1417             fscal            = _mm_and_ps(fscal,cutoff_mask);
1418
1419             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1420
1421              /* Update vectorial force */
1422             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1423             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1424             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1425
1426             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1427             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1428             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1429
1430             }
1431
1432             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1433             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1434             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1435             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1436
1437             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1438                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1439
1440             /* Inner loop uses 504 flops */
1441         }
1442
1443         /* End of innermost loop */
1444
1445         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1446                                               f+i_coord_offset,fshift+i_shift_offset);
1447
1448         ggid                        = gid[iidx];
1449         /* Update potential energies */
1450         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1451         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1452
1453         /* Increment number of inner iterations */
1454         inneriter                  += j_index_end - j_index_start;
1455
1456         /* Outer loop uses 20 flops */
1457     }
1458
1459     /* Increment number of outer iterations */
1460     outeriter        += nri;
1461
1462     /* Update outer/inner flops */
1463
1464     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*504);
1465 }
1466 /*
1467  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1468  * Electrostatics interaction: Ewald
1469  * VdW interaction:            LennardJones
1470  * Geometry:                   Water3-Water3
1471  * Calculate force/pot:        Force
1472  */
1473 void
1474 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_128_fma_single
1475                     (t_nblist * gmx_restrict                nlist,
1476                      rvec * gmx_restrict                    xx,
1477                      rvec * gmx_restrict                    ff,
1478                      t_forcerec * gmx_restrict              fr,
1479                      t_mdatoms * gmx_restrict               mdatoms,
1480                      nb_kernel_data_t * gmx_restrict        kernel_data,
1481                      t_nrnb * gmx_restrict                  nrnb)
1482 {
1483     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1484      * just 0 for non-waters.
1485      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1486      * jnr indices corresponding to data put in the four positions in the SIMD register.
1487      */
1488     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1489     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1490     int              jnrA,jnrB,jnrC,jnrD;
1491     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1492     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1493     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1494     real             rcutoff_scalar;
1495     real             *shiftvec,*fshift,*x,*f;
1496     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1497     real             scratch[4*DIM];
1498     __m128           fscal,rcutoff,rcutoff2,jidxall;
1499     int              vdwioffset0;
1500     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1501     int              vdwioffset1;
1502     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1503     int              vdwioffset2;
1504     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1505     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1506     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1507     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1508     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1509     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1510     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1511     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1512     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1513     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1514     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1515     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1516     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1517     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1518     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1519     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1520     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1521     real             *charge;
1522     int              nvdwtype;
1523     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1524     int              *vdwtype;
1525     real             *vdwparam;
1526     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1527     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1528     __m128i          ewitab;
1529     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1530     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1531     real             *ewtab;
1532     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1533     real             rswitch_scalar,d_scalar;
1534     __m128           dummy_mask,cutoff_mask;
1535     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1536     __m128           one     = _mm_set1_ps(1.0);
1537     __m128           two     = _mm_set1_ps(2.0);
1538     x                = xx[0];
1539     f                = ff[0];
1540
1541     nri              = nlist->nri;
1542     iinr             = nlist->iinr;
1543     jindex           = nlist->jindex;
1544     jjnr             = nlist->jjnr;
1545     shiftidx         = nlist->shift;
1546     gid              = nlist->gid;
1547     shiftvec         = fr->shift_vec[0];
1548     fshift           = fr->fshift[0];
1549     facel            = _mm_set1_ps(fr->epsfac);
1550     charge           = mdatoms->chargeA;
1551     nvdwtype         = fr->ntype;
1552     vdwparam         = fr->nbfp;
1553     vdwtype          = mdatoms->typeA;
1554
1555     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1556     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
1557     beta2            = _mm_mul_ps(beta,beta);
1558     beta3            = _mm_mul_ps(beta,beta2);
1559     ewtab            = fr->ic->tabq_coul_FDV0;
1560     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1561     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1562
1563     /* Setup water-specific parameters */
1564     inr              = nlist->iinr[0];
1565     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1566     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1567     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1568     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1569
1570     jq0              = _mm_set1_ps(charge[inr+0]);
1571     jq1              = _mm_set1_ps(charge[inr+1]);
1572     jq2              = _mm_set1_ps(charge[inr+2]);
1573     vdwjidx0A        = 2*vdwtype[inr+0];
1574     qq00             = _mm_mul_ps(iq0,jq0);
1575     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1576     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1577     qq01             = _mm_mul_ps(iq0,jq1);
1578     qq02             = _mm_mul_ps(iq0,jq2);
1579     qq10             = _mm_mul_ps(iq1,jq0);
1580     qq11             = _mm_mul_ps(iq1,jq1);
1581     qq12             = _mm_mul_ps(iq1,jq2);
1582     qq20             = _mm_mul_ps(iq2,jq0);
1583     qq21             = _mm_mul_ps(iq2,jq1);
1584     qq22             = _mm_mul_ps(iq2,jq2);
1585
1586     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1587     rcutoff_scalar   = fr->rcoulomb;
1588     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1589     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1590
1591     rswitch_scalar   = fr->rcoulomb_switch;
1592     rswitch          = _mm_set1_ps(rswitch_scalar);
1593     /* Setup switch parameters */
1594     d_scalar         = rcutoff_scalar-rswitch_scalar;
1595     d                = _mm_set1_ps(d_scalar);
1596     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1597     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1598     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1599     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1600     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1601     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1602
1603     /* Avoid stupid compiler warnings */
1604     jnrA = jnrB = jnrC = jnrD = 0;
1605     j_coord_offsetA = 0;
1606     j_coord_offsetB = 0;
1607     j_coord_offsetC = 0;
1608     j_coord_offsetD = 0;
1609
1610     outeriter        = 0;
1611     inneriter        = 0;
1612
1613     for(iidx=0;iidx<4*DIM;iidx++)
1614     {
1615         scratch[iidx] = 0.0;
1616     }
1617
1618     /* Start outer loop over neighborlists */
1619     for(iidx=0; iidx<nri; iidx++)
1620     {
1621         /* Load shift vector for this list */
1622         i_shift_offset   = DIM*shiftidx[iidx];
1623
1624         /* Load limits for loop over neighbors */
1625         j_index_start    = jindex[iidx];
1626         j_index_end      = jindex[iidx+1];
1627
1628         /* Get outer coordinate index */
1629         inr              = iinr[iidx];
1630         i_coord_offset   = DIM*inr;
1631
1632         /* Load i particle coords and add shift vector */
1633         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1634                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1635
1636         fix0             = _mm_setzero_ps();
1637         fiy0             = _mm_setzero_ps();
1638         fiz0             = _mm_setzero_ps();
1639         fix1             = _mm_setzero_ps();
1640         fiy1             = _mm_setzero_ps();
1641         fiz1             = _mm_setzero_ps();
1642         fix2             = _mm_setzero_ps();
1643         fiy2             = _mm_setzero_ps();
1644         fiz2             = _mm_setzero_ps();
1645
1646         /* Start inner kernel loop */
1647         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1648         {
1649
1650             /* Get j neighbor index, and coordinate index */
1651             jnrA             = jjnr[jidx];
1652             jnrB             = jjnr[jidx+1];
1653             jnrC             = jjnr[jidx+2];
1654             jnrD             = jjnr[jidx+3];
1655             j_coord_offsetA  = DIM*jnrA;
1656             j_coord_offsetB  = DIM*jnrB;
1657             j_coord_offsetC  = DIM*jnrC;
1658             j_coord_offsetD  = DIM*jnrD;
1659
1660             /* load j atom coordinates */
1661             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1662                                               x+j_coord_offsetC,x+j_coord_offsetD,
1663                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1664
1665             /* Calculate displacement vector */
1666             dx00             = _mm_sub_ps(ix0,jx0);
1667             dy00             = _mm_sub_ps(iy0,jy0);
1668             dz00             = _mm_sub_ps(iz0,jz0);
1669             dx01             = _mm_sub_ps(ix0,jx1);
1670             dy01             = _mm_sub_ps(iy0,jy1);
1671             dz01             = _mm_sub_ps(iz0,jz1);
1672             dx02             = _mm_sub_ps(ix0,jx2);
1673             dy02             = _mm_sub_ps(iy0,jy2);
1674             dz02             = _mm_sub_ps(iz0,jz2);
1675             dx10             = _mm_sub_ps(ix1,jx0);
1676             dy10             = _mm_sub_ps(iy1,jy0);
1677             dz10             = _mm_sub_ps(iz1,jz0);
1678             dx11             = _mm_sub_ps(ix1,jx1);
1679             dy11             = _mm_sub_ps(iy1,jy1);
1680             dz11             = _mm_sub_ps(iz1,jz1);
1681             dx12             = _mm_sub_ps(ix1,jx2);
1682             dy12             = _mm_sub_ps(iy1,jy2);
1683             dz12             = _mm_sub_ps(iz1,jz2);
1684             dx20             = _mm_sub_ps(ix2,jx0);
1685             dy20             = _mm_sub_ps(iy2,jy0);
1686             dz20             = _mm_sub_ps(iz2,jz0);
1687             dx21             = _mm_sub_ps(ix2,jx1);
1688             dy21             = _mm_sub_ps(iy2,jy1);
1689             dz21             = _mm_sub_ps(iz2,jz1);
1690             dx22             = _mm_sub_ps(ix2,jx2);
1691             dy22             = _mm_sub_ps(iy2,jy2);
1692             dz22             = _mm_sub_ps(iz2,jz2);
1693
1694             /* Calculate squared distance and things based on it */
1695             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1696             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1697             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1698             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1699             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1700             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1701             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1702             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1703             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1704
1705             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1706             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1707             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1708             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1709             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1710             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1711             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1712             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1713             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1714
1715             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1716             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1717             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1718             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1719             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1720             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1721             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1722             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1723             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1724
1725             fjx0             = _mm_setzero_ps();
1726             fjy0             = _mm_setzero_ps();
1727             fjz0             = _mm_setzero_ps();
1728             fjx1             = _mm_setzero_ps();
1729             fjy1             = _mm_setzero_ps();
1730             fjz1             = _mm_setzero_ps();
1731             fjx2             = _mm_setzero_ps();
1732             fjy2             = _mm_setzero_ps();
1733             fjz2             = _mm_setzero_ps();
1734
1735             /**************************
1736              * CALCULATE INTERACTIONS *
1737              **************************/
1738
1739             if (gmx_mm_any_lt(rsq00,rcutoff2))
1740             {
1741
1742             r00              = _mm_mul_ps(rsq00,rinv00);
1743
1744             /* EWALD ELECTROSTATICS */
1745
1746             /* Analytical PME correction */
1747             zeta2            = _mm_mul_ps(beta2,rsq00);
1748             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1749             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1750             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1751             felec            = _mm_mul_ps(qq00,felec);
1752             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1753             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
1754             velec            = _mm_mul_ps(qq00,velec);
1755
1756             /* LENNARD-JONES DISPERSION/REPULSION */
1757
1758             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1759             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1760             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1761             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
1762             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1763
1764             d                = _mm_sub_ps(r00,rswitch);
1765             d                = _mm_max_ps(d,_mm_setzero_ps());
1766             d2               = _mm_mul_ps(d,d);
1767             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1768
1769             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1770
1771             /* Evaluate switch function */
1772             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1773             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1774             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1775             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1776
1777             fscal            = _mm_add_ps(felec,fvdw);
1778
1779             fscal            = _mm_and_ps(fscal,cutoff_mask);
1780
1781              /* Update vectorial force */
1782             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1783             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1784             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1785
1786             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1787             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1788             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1789
1790             }
1791
1792             /**************************
1793              * CALCULATE INTERACTIONS *
1794              **************************/
1795
1796             if (gmx_mm_any_lt(rsq01,rcutoff2))
1797             {
1798
1799             r01              = _mm_mul_ps(rsq01,rinv01);
1800
1801             /* EWALD ELECTROSTATICS */
1802
1803             /* Analytical PME correction */
1804             zeta2            = _mm_mul_ps(beta2,rsq01);
1805             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1806             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1807             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1808             felec            = _mm_mul_ps(qq01,felec);
1809             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1810             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1811             velec            = _mm_mul_ps(qq01,velec);
1812
1813             d                = _mm_sub_ps(r01,rswitch);
1814             d                = _mm_max_ps(d,_mm_setzero_ps());
1815             d2               = _mm_mul_ps(d,d);
1816             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1817
1818             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1819
1820             /* Evaluate switch function */
1821             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1822             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1823             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1824
1825             fscal            = felec;
1826
1827             fscal            = _mm_and_ps(fscal,cutoff_mask);
1828
1829              /* Update vectorial force */
1830             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1831             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1832             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1833
1834             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1835             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1836             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1837
1838             }
1839
1840             /**************************
1841              * CALCULATE INTERACTIONS *
1842              **************************/
1843
1844             if (gmx_mm_any_lt(rsq02,rcutoff2))
1845             {
1846
1847             r02              = _mm_mul_ps(rsq02,rinv02);
1848
1849             /* EWALD ELECTROSTATICS */
1850
1851             /* Analytical PME correction */
1852             zeta2            = _mm_mul_ps(beta2,rsq02);
1853             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1854             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1855             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1856             felec            = _mm_mul_ps(qq02,felec);
1857             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1858             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1859             velec            = _mm_mul_ps(qq02,velec);
1860
1861             d                = _mm_sub_ps(r02,rswitch);
1862             d                = _mm_max_ps(d,_mm_setzero_ps());
1863             d2               = _mm_mul_ps(d,d);
1864             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1865
1866             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1867
1868             /* Evaluate switch function */
1869             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1870             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1871             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1872
1873             fscal            = felec;
1874
1875             fscal            = _mm_and_ps(fscal,cutoff_mask);
1876
1877              /* Update vectorial force */
1878             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1879             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1880             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1881
1882             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1883             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1884             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1885
1886             }
1887
1888             /**************************
1889              * CALCULATE INTERACTIONS *
1890              **************************/
1891
1892             if (gmx_mm_any_lt(rsq10,rcutoff2))
1893             {
1894
1895             r10              = _mm_mul_ps(rsq10,rinv10);
1896
1897             /* EWALD ELECTROSTATICS */
1898
1899             /* Analytical PME correction */
1900             zeta2            = _mm_mul_ps(beta2,rsq10);
1901             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1902             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1903             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1904             felec            = _mm_mul_ps(qq10,felec);
1905             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1906             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1907             velec            = _mm_mul_ps(qq10,velec);
1908
1909             d                = _mm_sub_ps(r10,rswitch);
1910             d                = _mm_max_ps(d,_mm_setzero_ps());
1911             d2               = _mm_mul_ps(d,d);
1912             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1913
1914             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1915
1916             /* Evaluate switch function */
1917             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1918             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1919             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1920
1921             fscal            = felec;
1922
1923             fscal            = _mm_and_ps(fscal,cutoff_mask);
1924
1925              /* Update vectorial force */
1926             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1927             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1928             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1929
1930             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1931             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1932             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1933
1934             }
1935
1936             /**************************
1937              * CALCULATE INTERACTIONS *
1938              **************************/
1939
1940             if (gmx_mm_any_lt(rsq11,rcutoff2))
1941             {
1942
1943             r11              = _mm_mul_ps(rsq11,rinv11);
1944
1945             /* EWALD ELECTROSTATICS */
1946
1947             /* Analytical PME correction */
1948             zeta2            = _mm_mul_ps(beta2,rsq11);
1949             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1950             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1951             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1952             felec            = _mm_mul_ps(qq11,felec);
1953             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1954             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1955             velec            = _mm_mul_ps(qq11,velec);
1956
1957             d                = _mm_sub_ps(r11,rswitch);
1958             d                = _mm_max_ps(d,_mm_setzero_ps());
1959             d2               = _mm_mul_ps(d,d);
1960             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1961
1962             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1963
1964             /* Evaluate switch function */
1965             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1966             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1967             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1968
1969             fscal            = felec;
1970
1971             fscal            = _mm_and_ps(fscal,cutoff_mask);
1972
1973              /* Update vectorial force */
1974             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1975             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1976             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1977
1978             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1979             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1980             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1981
1982             }
1983
1984             /**************************
1985              * CALCULATE INTERACTIONS *
1986              **************************/
1987
1988             if (gmx_mm_any_lt(rsq12,rcutoff2))
1989             {
1990
1991             r12              = _mm_mul_ps(rsq12,rinv12);
1992
1993             /* EWALD ELECTROSTATICS */
1994
1995             /* Analytical PME correction */
1996             zeta2            = _mm_mul_ps(beta2,rsq12);
1997             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1998             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1999             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2000             felec            = _mm_mul_ps(qq12,felec);
2001             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2002             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2003             velec            = _mm_mul_ps(qq12,velec);
2004
2005             d                = _mm_sub_ps(r12,rswitch);
2006             d                = _mm_max_ps(d,_mm_setzero_ps());
2007             d2               = _mm_mul_ps(d,d);
2008             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2009
2010             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2011
2012             /* Evaluate switch function */
2013             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2014             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2015             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2016
2017             fscal            = felec;
2018
2019             fscal            = _mm_and_ps(fscal,cutoff_mask);
2020
2021              /* Update vectorial force */
2022             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2023             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2024             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2025
2026             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2027             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2028             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2029
2030             }
2031
2032             /**************************
2033              * CALCULATE INTERACTIONS *
2034              **************************/
2035
2036             if (gmx_mm_any_lt(rsq20,rcutoff2))
2037             {
2038
2039             r20              = _mm_mul_ps(rsq20,rinv20);
2040
2041             /* EWALD ELECTROSTATICS */
2042
2043             /* Analytical PME correction */
2044             zeta2            = _mm_mul_ps(beta2,rsq20);
2045             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
2046             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2047             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2048             felec            = _mm_mul_ps(qq20,felec);
2049             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2050             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2051             velec            = _mm_mul_ps(qq20,velec);
2052
2053             d                = _mm_sub_ps(r20,rswitch);
2054             d                = _mm_max_ps(d,_mm_setzero_ps());
2055             d2               = _mm_mul_ps(d,d);
2056             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2057
2058             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2059
2060             /* Evaluate switch function */
2061             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2062             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2063             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2064
2065             fscal            = felec;
2066
2067             fscal            = _mm_and_ps(fscal,cutoff_mask);
2068
2069              /* Update vectorial force */
2070             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2071             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2072             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2073
2074             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2075             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2076             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2077
2078             }
2079
2080             /**************************
2081              * CALCULATE INTERACTIONS *
2082              **************************/
2083
2084             if (gmx_mm_any_lt(rsq21,rcutoff2))
2085             {
2086
2087             r21              = _mm_mul_ps(rsq21,rinv21);
2088
2089             /* EWALD ELECTROSTATICS */
2090
2091             /* Analytical PME correction */
2092             zeta2            = _mm_mul_ps(beta2,rsq21);
2093             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2094             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2095             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2096             felec            = _mm_mul_ps(qq21,felec);
2097             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2098             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2099             velec            = _mm_mul_ps(qq21,velec);
2100
2101             d                = _mm_sub_ps(r21,rswitch);
2102             d                = _mm_max_ps(d,_mm_setzero_ps());
2103             d2               = _mm_mul_ps(d,d);
2104             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2105
2106             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2107
2108             /* Evaluate switch function */
2109             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2110             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2111             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2112
2113             fscal            = felec;
2114
2115             fscal            = _mm_and_ps(fscal,cutoff_mask);
2116
2117              /* Update vectorial force */
2118             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2119             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2120             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2121
2122             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2123             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2124             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2125
2126             }
2127
2128             /**************************
2129              * CALCULATE INTERACTIONS *
2130              **************************/
2131
2132             if (gmx_mm_any_lt(rsq22,rcutoff2))
2133             {
2134
2135             r22              = _mm_mul_ps(rsq22,rinv22);
2136
2137             /* EWALD ELECTROSTATICS */
2138
2139             /* Analytical PME correction */
2140             zeta2            = _mm_mul_ps(beta2,rsq22);
2141             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2142             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2143             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2144             felec            = _mm_mul_ps(qq22,felec);
2145             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2146             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2147             velec            = _mm_mul_ps(qq22,velec);
2148
2149             d                = _mm_sub_ps(r22,rswitch);
2150             d                = _mm_max_ps(d,_mm_setzero_ps());
2151             d2               = _mm_mul_ps(d,d);
2152             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2153
2154             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2155
2156             /* Evaluate switch function */
2157             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2158             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2159             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2160
2161             fscal            = felec;
2162
2163             fscal            = _mm_and_ps(fscal,cutoff_mask);
2164
2165              /* Update vectorial force */
2166             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2167             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2168             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2169
2170             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2171             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2172             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2173
2174             }
2175
2176             fjptrA             = f+j_coord_offsetA;
2177             fjptrB             = f+j_coord_offsetB;
2178             fjptrC             = f+j_coord_offsetC;
2179             fjptrD             = f+j_coord_offsetD;
2180
2181             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2182                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2183
2184             /* Inner loop uses 465 flops */
2185         }
2186
2187         if(jidx<j_index_end)
2188         {
2189
2190             /* Get j neighbor index, and coordinate index */
2191             jnrlistA         = jjnr[jidx];
2192             jnrlistB         = jjnr[jidx+1];
2193             jnrlistC         = jjnr[jidx+2];
2194             jnrlistD         = jjnr[jidx+3];
2195             /* Sign of each element will be negative for non-real atoms.
2196              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2197              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2198              */
2199             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2200             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2201             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2202             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2203             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2204             j_coord_offsetA  = DIM*jnrA;
2205             j_coord_offsetB  = DIM*jnrB;
2206             j_coord_offsetC  = DIM*jnrC;
2207             j_coord_offsetD  = DIM*jnrD;
2208
2209             /* load j atom coordinates */
2210             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2211                                               x+j_coord_offsetC,x+j_coord_offsetD,
2212                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2213
2214             /* Calculate displacement vector */
2215             dx00             = _mm_sub_ps(ix0,jx0);
2216             dy00             = _mm_sub_ps(iy0,jy0);
2217             dz00             = _mm_sub_ps(iz0,jz0);
2218             dx01             = _mm_sub_ps(ix0,jx1);
2219             dy01             = _mm_sub_ps(iy0,jy1);
2220             dz01             = _mm_sub_ps(iz0,jz1);
2221             dx02             = _mm_sub_ps(ix0,jx2);
2222             dy02             = _mm_sub_ps(iy0,jy2);
2223             dz02             = _mm_sub_ps(iz0,jz2);
2224             dx10             = _mm_sub_ps(ix1,jx0);
2225             dy10             = _mm_sub_ps(iy1,jy0);
2226             dz10             = _mm_sub_ps(iz1,jz0);
2227             dx11             = _mm_sub_ps(ix1,jx1);
2228             dy11             = _mm_sub_ps(iy1,jy1);
2229             dz11             = _mm_sub_ps(iz1,jz1);
2230             dx12             = _mm_sub_ps(ix1,jx2);
2231             dy12             = _mm_sub_ps(iy1,jy2);
2232             dz12             = _mm_sub_ps(iz1,jz2);
2233             dx20             = _mm_sub_ps(ix2,jx0);
2234             dy20             = _mm_sub_ps(iy2,jy0);
2235             dz20             = _mm_sub_ps(iz2,jz0);
2236             dx21             = _mm_sub_ps(ix2,jx1);
2237             dy21             = _mm_sub_ps(iy2,jy1);
2238             dz21             = _mm_sub_ps(iz2,jz1);
2239             dx22             = _mm_sub_ps(ix2,jx2);
2240             dy22             = _mm_sub_ps(iy2,jy2);
2241             dz22             = _mm_sub_ps(iz2,jz2);
2242
2243             /* Calculate squared distance and things based on it */
2244             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2245             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2246             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2247             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2248             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2249             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2250             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2251             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2252             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2253
2254             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2255             rinv01           = gmx_mm_invsqrt_ps(rsq01);
2256             rinv02           = gmx_mm_invsqrt_ps(rsq02);
2257             rinv10           = gmx_mm_invsqrt_ps(rsq10);
2258             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2259             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2260             rinv20           = gmx_mm_invsqrt_ps(rsq20);
2261             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2262             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2263
2264             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2265             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2266             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2267             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2268             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2269             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2270             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2271             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2272             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2273
2274             fjx0             = _mm_setzero_ps();
2275             fjy0             = _mm_setzero_ps();
2276             fjz0             = _mm_setzero_ps();
2277             fjx1             = _mm_setzero_ps();
2278             fjy1             = _mm_setzero_ps();
2279             fjz1             = _mm_setzero_ps();
2280             fjx2             = _mm_setzero_ps();
2281             fjy2             = _mm_setzero_ps();
2282             fjz2             = _mm_setzero_ps();
2283
2284             /**************************
2285              * CALCULATE INTERACTIONS *
2286              **************************/
2287
2288             if (gmx_mm_any_lt(rsq00,rcutoff2))
2289             {
2290
2291             r00              = _mm_mul_ps(rsq00,rinv00);
2292             r00              = _mm_andnot_ps(dummy_mask,r00);
2293
2294             /* EWALD ELECTROSTATICS */
2295
2296             /* Analytical PME correction */
2297             zeta2            = _mm_mul_ps(beta2,rsq00);
2298             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
2299             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2300             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2301             felec            = _mm_mul_ps(qq00,felec);
2302             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2303             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
2304             velec            = _mm_mul_ps(qq00,velec);
2305
2306             /* LENNARD-JONES DISPERSION/REPULSION */
2307
2308             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2309             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2310             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2311             vvdw             = _mm_msub_ps(vvdw12,one_twelfth,_mm_mul_ps(vvdw6,one_sixth));
2312             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2313
2314             d                = _mm_sub_ps(r00,rswitch);
2315             d                = _mm_max_ps(d,_mm_setzero_ps());
2316             d2               = _mm_mul_ps(d,d);
2317             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2318
2319             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2320
2321             /* Evaluate switch function */
2322             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2323             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2324             fvdw             = _mm_msub_ps( fvdw,sw , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2325             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2326
2327             fscal            = _mm_add_ps(felec,fvdw);
2328
2329             fscal            = _mm_and_ps(fscal,cutoff_mask);
2330
2331             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2332
2333              /* Update vectorial force */
2334             fix0             = _mm_macc_ps(dx00,fscal,fix0);
2335             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
2336             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
2337
2338             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
2339             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
2340             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
2341
2342             }
2343
2344             /**************************
2345              * CALCULATE INTERACTIONS *
2346              **************************/
2347
2348             if (gmx_mm_any_lt(rsq01,rcutoff2))
2349             {
2350
2351             r01              = _mm_mul_ps(rsq01,rinv01);
2352             r01              = _mm_andnot_ps(dummy_mask,r01);
2353
2354             /* EWALD ELECTROSTATICS */
2355
2356             /* Analytical PME correction */
2357             zeta2            = _mm_mul_ps(beta2,rsq01);
2358             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
2359             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2360             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2361             felec            = _mm_mul_ps(qq01,felec);
2362             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2363             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
2364             velec            = _mm_mul_ps(qq01,velec);
2365
2366             d                = _mm_sub_ps(r01,rswitch);
2367             d                = _mm_max_ps(d,_mm_setzero_ps());
2368             d2               = _mm_mul_ps(d,d);
2369             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2370
2371             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2372
2373             /* Evaluate switch function */
2374             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2375             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2376             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2377
2378             fscal            = felec;
2379
2380             fscal            = _mm_and_ps(fscal,cutoff_mask);
2381
2382             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2383
2384              /* Update vectorial force */
2385             fix0             = _mm_macc_ps(dx01,fscal,fix0);
2386             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
2387             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
2388
2389             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
2390             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
2391             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
2392
2393             }
2394
2395             /**************************
2396              * CALCULATE INTERACTIONS *
2397              **************************/
2398
2399             if (gmx_mm_any_lt(rsq02,rcutoff2))
2400             {
2401
2402             r02              = _mm_mul_ps(rsq02,rinv02);
2403             r02              = _mm_andnot_ps(dummy_mask,r02);
2404
2405             /* EWALD ELECTROSTATICS */
2406
2407             /* Analytical PME correction */
2408             zeta2            = _mm_mul_ps(beta2,rsq02);
2409             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
2410             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2411             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2412             felec            = _mm_mul_ps(qq02,felec);
2413             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2414             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
2415             velec            = _mm_mul_ps(qq02,velec);
2416
2417             d                = _mm_sub_ps(r02,rswitch);
2418             d                = _mm_max_ps(d,_mm_setzero_ps());
2419             d2               = _mm_mul_ps(d,d);
2420             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2421
2422             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2423
2424             /* Evaluate switch function */
2425             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2426             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2427             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2428
2429             fscal            = felec;
2430
2431             fscal            = _mm_and_ps(fscal,cutoff_mask);
2432
2433             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2434
2435              /* Update vectorial force */
2436             fix0             = _mm_macc_ps(dx02,fscal,fix0);
2437             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
2438             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
2439
2440             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
2441             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
2442             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
2443
2444             }
2445
2446             /**************************
2447              * CALCULATE INTERACTIONS *
2448              **************************/
2449
2450             if (gmx_mm_any_lt(rsq10,rcutoff2))
2451             {
2452
2453             r10              = _mm_mul_ps(rsq10,rinv10);
2454             r10              = _mm_andnot_ps(dummy_mask,r10);
2455
2456             /* EWALD ELECTROSTATICS */
2457
2458             /* Analytical PME correction */
2459             zeta2            = _mm_mul_ps(beta2,rsq10);
2460             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
2461             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2462             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2463             felec            = _mm_mul_ps(qq10,felec);
2464             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2465             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
2466             velec            = _mm_mul_ps(qq10,velec);
2467
2468             d                = _mm_sub_ps(r10,rswitch);
2469             d                = _mm_max_ps(d,_mm_setzero_ps());
2470             d2               = _mm_mul_ps(d,d);
2471             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2472
2473             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2474
2475             /* Evaluate switch function */
2476             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2477             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2478             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2479
2480             fscal            = felec;
2481
2482             fscal            = _mm_and_ps(fscal,cutoff_mask);
2483
2484             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2485
2486              /* Update vectorial force */
2487             fix1             = _mm_macc_ps(dx10,fscal,fix1);
2488             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
2489             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
2490
2491             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
2492             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
2493             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
2494
2495             }
2496
2497             /**************************
2498              * CALCULATE INTERACTIONS *
2499              **************************/
2500
2501             if (gmx_mm_any_lt(rsq11,rcutoff2))
2502             {
2503
2504             r11              = _mm_mul_ps(rsq11,rinv11);
2505             r11              = _mm_andnot_ps(dummy_mask,r11);
2506
2507             /* EWALD ELECTROSTATICS */
2508
2509             /* Analytical PME correction */
2510             zeta2            = _mm_mul_ps(beta2,rsq11);
2511             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
2512             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2513             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2514             felec            = _mm_mul_ps(qq11,felec);
2515             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2516             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2517             velec            = _mm_mul_ps(qq11,velec);
2518
2519             d                = _mm_sub_ps(r11,rswitch);
2520             d                = _mm_max_ps(d,_mm_setzero_ps());
2521             d2               = _mm_mul_ps(d,d);
2522             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2523
2524             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2525
2526             /* Evaluate switch function */
2527             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2528             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2529             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2530
2531             fscal            = felec;
2532
2533             fscal            = _mm_and_ps(fscal,cutoff_mask);
2534
2535             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2536
2537              /* Update vectorial force */
2538             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2539             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2540             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2541
2542             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2543             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2544             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2545
2546             }
2547
2548             /**************************
2549              * CALCULATE INTERACTIONS *
2550              **************************/
2551
2552             if (gmx_mm_any_lt(rsq12,rcutoff2))
2553             {
2554
2555             r12              = _mm_mul_ps(rsq12,rinv12);
2556             r12              = _mm_andnot_ps(dummy_mask,r12);
2557
2558             /* EWALD ELECTROSTATICS */
2559
2560             /* Analytical PME correction */
2561             zeta2            = _mm_mul_ps(beta2,rsq12);
2562             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
2563             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2564             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2565             felec            = _mm_mul_ps(qq12,felec);
2566             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2567             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2568             velec            = _mm_mul_ps(qq12,velec);
2569
2570             d                = _mm_sub_ps(r12,rswitch);
2571             d                = _mm_max_ps(d,_mm_setzero_ps());
2572             d2               = _mm_mul_ps(d,d);
2573             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2574
2575             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2576
2577             /* Evaluate switch function */
2578             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2579             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2580             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2581
2582             fscal            = felec;
2583
2584             fscal            = _mm_and_ps(fscal,cutoff_mask);
2585
2586             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2587
2588              /* Update vectorial force */
2589             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2590             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2591             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2592
2593             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2594             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2595             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2596
2597             }
2598
2599             /**************************
2600              * CALCULATE INTERACTIONS *
2601              **************************/
2602
2603             if (gmx_mm_any_lt(rsq20,rcutoff2))
2604             {
2605
2606             r20              = _mm_mul_ps(rsq20,rinv20);
2607             r20              = _mm_andnot_ps(dummy_mask,r20);
2608
2609             /* EWALD ELECTROSTATICS */
2610
2611             /* Analytical PME correction */
2612             zeta2            = _mm_mul_ps(beta2,rsq20);
2613             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
2614             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2615             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2616             felec            = _mm_mul_ps(qq20,felec);
2617             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2618             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2619             velec            = _mm_mul_ps(qq20,velec);
2620
2621             d                = _mm_sub_ps(r20,rswitch);
2622             d                = _mm_max_ps(d,_mm_setzero_ps());
2623             d2               = _mm_mul_ps(d,d);
2624             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2625
2626             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2627
2628             /* Evaluate switch function */
2629             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2630             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2631             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2632
2633             fscal            = felec;
2634
2635             fscal            = _mm_and_ps(fscal,cutoff_mask);
2636
2637             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2638
2639              /* Update vectorial force */
2640             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2641             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2642             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2643
2644             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2645             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2646             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2647
2648             }
2649
2650             /**************************
2651              * CALCULATE INTERACTIONS *
2652              **************************/
2653
2654             if (gmx_mm_any_lt(rsq21,rcutoff2))
2655             {
2656
2657             r21              = _mm_mul_ps(rsq21,rinv21);
2658             r21              = _mm_andnot_ps(dummy_mask,r21);
2659
2660             /* EWALD ELECTROSTATICS */
2661
2662             /* Analytical PME correction */
2663             zeta2            = _mm_mul_ps(beta2,rsq21);
2664             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2665             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2666             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2667             felec            = _mm_mul_ps(qq21,felec);
2668             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2669             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2670             velec            = _mm_mul_ps(qq21,velec);
2671
2672             d                = _mm_sub_ps(r21,rswitch);
2673             d                = _mm_max_ps(d,_mm_setzero_ps());
2674             d2               = _mm_mul_ps(d,d);
2675             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2676
2677             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2678
2679             /* Evaluate switch function */
2680             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2681             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2682             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2683
2684             fscal            = felec;
2685
2686             fscal            = _mm_and_ps(fscal,cutoff_mask);
2687
2688             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2689
2690              /* Update vectorial force */
2691             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2692             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2693             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2694
2695             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2696             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2697             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2698
2699             }
2700
2701             /**************************
2702              * CALCULATE INTERACTIONS *
2703              **************************/
2704
2705             if (gmx_mm_any_lt(rsq22,rcutoff2))
2706             {
2707
2708             r22              = _mm_mul_ps(rsq22,rinv22);
2709             r22              = _mm_andnot_ps(dummy_mask,r22);
2710
2711             /* EWALD ELECTROSTATICS */
2712
2713             /* Analytical PME correction */
2714             zeta2            = _mm_mul_ps(beta2,rsq22);
2715             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2716             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2717             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2718             felec            = _mm_mul_ps(qq22,felec);
2719             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2720             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2721             velec            = _mm_mul_ps(qq22,velec);
2722
2723             d                = _mm_sub_ps(r22,rswitch);
2724             d                = _mm_max_ps(d,_mm_setzero_ps());
2725             d2               = _mm_mul_ps(d,d);
2726             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2727
2728             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2729
2730             /* Evaluate switch function */
2731             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2732             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2733             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2734
2735             fscal            = felec;
2736
2737             fscal            = _mm_and_ps(fscal,cutoff_mask);
2738
2739             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2740
2741              /* Update vectorial force */
2742             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2743             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2744             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2745
2746             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2747             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2748             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2749
2750             }
2751
2752             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2753             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2754             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2755             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2756
2757             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2758                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2759
2760             /* Inner loop uses 474 flops */
2761         }
2762
2763         /* End of innermost loop */
2764
2765         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2766                                               f+i_coord_offset,fshift+i_shift_offset);
2767
2768         /* Increment number of inner iterations */
2769         inneriter                  += j_index_end - j_index_start;
2770
2771         /* Outer loop uses 18 flops */
2772     }
2773
2774     /* Increment number of outer iterations */
2775     outeriter        += nri;
2776
2777     /* Update outer/inner flops */
2778
2779     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*474);
2780 }