Created SIMD module
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_128_fma_single / nb_kernel_ElecEwSw_VdwNone_GeomW3W3_avx_128_fma_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_128_fma_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_128_fma_single.h"
50 #include "kernelutil_x86_avx_128_fma_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            None
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_128_fma_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     __m128i          ewitab;
109     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
110     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
111     real             *ewtab;
112     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
113     real             rswitch_scalar,d_scalar;
114     __m128           dummy_mask,cutoff_mask;
115     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
116     __m128           one     = _mm_set1_ps(1.0);
117     __m128           two     = _mm_set1_ps(2.0);
118     x                = xx[0];
119     f                = ff[0];
120
121     nri              = nlist->nri;
122     iinr             = nlist->iinr;
123     jindex           = nlist->jindex;
124     jjnr             = nlist->jjnr;
125     shiftidx         = nlist->shift;
126     gid              = nlist->gid;
127     shiftvec         = fr->shift_vec[0];
128     fshift           = fr->fshift[0];
129     facel            = _mm_set1_ps(fr->epsfac);
130     charge           = mdatoms->chargeA;
131
132     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
133     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
134     beta2            = _mm_mul_ps(beta,beta);
135     beta3            = _mm_mul_ps(beta,beta2);
136     ewtab            = fr->ic->tabq_coul_FDV0;
137     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
138     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
139
140     /* Setup water-specific parameters */
141     inr              = nlist->iinr[0];
142     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
143     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145
146     jq0              = _mm_set1_ps(charge[inr+0]);
147     jq1              = _mm_set1_ps(charge[inr+1]);
148     jq2              = _mm_set1_ps(charge[inr+2]);
149     qq00             = _mm_mul_ps(iq0,jq0);
150     qq01             = _mm_mul_ps(iq0,jq1);
151     qq02             = _mm_mul_ps(iq0,jq2);
152     qq10             = _mm_mul_ps(iq1,jq0);
153     qq11             = _mm_mul_ps(iq1,jq1);
154     qq12             = _mm_mul_ps(iq1,jq2);
155     qq20             = _mm_mul_ps(iq2,jq0);
156     qq21             = _mm_mul_ps(iq2,jq1);
157     qq22             = _mm_mul_ps(iq2,jq2);
158
159     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
160     rcutoff_scalar   = fr->rcoulomb;
161     rcutoff          = _mm_set1_ps(rcutoff_scalar);
162     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
163
164     rswitch_scalar   = fr->rcoulomb_switch;
165     rswitch          = _mm_set1_ps(rswitch_scalar);
166     /* Setup switch parameters */
167     d_scalar         = rcutoff_scalar-rswitch_scalar;
168     d                = _mm_set1_ps(d_scalar);
169     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
170     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
171     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
172     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
173     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
175
176     /* Avoid stupid compiler warnings */
177     jnrA = jnrB = jnrC = jnrD = 0;
178     j_coord_offsetA = 0;
179     j_coord_offsetB = 0;
180     j_coord_offsetC = 0;
181     j_coord_offsetD = 0;
182
183     outeriter        = 0;
184     inneriter        = 0;
185
186     for(iidx=0;iidx<4*DIM;iidx++)
187     {
188         scratch[iidx] = 0.0;
189     }
190
191     /* Start outer loop over neighborlists */
192     for(iidx=0; iidx<nri; iidx++)
193     {
194         /* Load shift vector for this list */
195         i_shift_offset   = DIM*shiftidx[iidx];
196
197         /* Load limits for loop over neighbors */
198         j_index_start    = jindex[iidx];
199         j_index_end      = jindex[iidx+1];
200
201         /* Get outer coordinate index */
202         inr              = iinr[iidx];
203         i_coord_offset   = DIM*inr;
204
205         /* Load i particle coords and add shift vector */
206         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
207                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
208
209         fix0             = _mm_setzero_ps();
210         fiy0             = _mm_setzero_ps();
211         fiz0             = _mm_setzero_ps();
212         fix1             = _mm_setzero_ps();
213         fiy1             = _mm_setzero_ps();
214         fiz1             = _mm_setzero_ps();
215         fix2             = _mm_setzero_ps();
216         fiy2             = _mm_setzero_ps();
217         fiz2             = _mm_setzero_ps();
218
219         /* Reset potential sums */
220         velecsum         = _mm_setzero_ps();
221
222         /* Start inner kernel loop */
223         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
224         {
225
226             /* Get j neighbor index, and coordinate index */
227             jnrA             = jjnr[jidx];
228             jnrB             = jjnr[jidx+1];
229             jnrC             = jjnr[jidx+2];
230             jnrD             = jjnr[jidx+3];
231             j_coord_offsetA  = DIM*jnrA;
232             j_coord_offsetB  = DIM*jnrB;
233             j_coord_offsetC  = DIM*jnrC;
234             j_coord_offsetD  = DIM*jnrD;
235
236             /* load j atom coordinates */
237             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
238                                               x+j_coord_offsetC,x+j_coord_offsetD,
239                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
240
241             /* Calculate displacement vector */
242             dx00             = _mm_sub_ps(ix0,jx0);
243             dy00             = _mm_sub_ps(iy0,jy0);
244             dz00             = _mm_sub_ps(iz0,jz0);
245             dx01             = _mm_sub_ps(ix0,jx1);
246             dy01             = _mm_sub_ps(iy0,jy1);
247             dz01             = _mm_sub_ps(iz0,jz1);
248             dx02             = _mm_sub_ps(ix0,jx2);
249             dy02             = _mm_sub_ps(iy0,jy2);
250             dz02             = _mm_sub_ps(iz0,jz2);
251             dx10             = _mm_sub_ps(ix1,jx0);
252             dy10             = _mm_sub_ps(iy1,jy0);
253             dz10             = _mm_sub_ps(iz1,jz0);
254             dx11             = _mm_sub_ps(ix1,jx1);
255             dy11             = _mm_sub_ps(iy1,jy1);
256             dz11             = _mm_sub_ps(iz1,jz1);
257             dx12             = _mm_sub_ps(ix1,jx2);
258             dy12             = _mm_sub_ps(iy1,jy2);
259             dz12             = _mm_sub_ps(iz1,jz2);
260             dx20             = _mm_sub_ps(ix2,jx0);
261             dy20             = _mm_sub_ps(iy2,jy0);
262             dz20             = _mm_sub_ps(iz2,jz0);
263             dx21             = _mm_sub_ps(ix2,jx1);
264             dy21             = _mm_sub_ps(iy2,jy1);
265             dz21             = _mm_sub_ps(iz2,jz1);
266             dx22             = _mm_sub_ps(ix2,jx2);
267             dy22             = _mm_sub_ps(iy2,jy2);
268             dz22             = _mm_sub_ps(iz2,jz2);
269
270             /* Calculate squared distance and things based on it */
271             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
272             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
273             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
274             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
275             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
276             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
277             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
278             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
279             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
280
281             rinv00           = gmx_mm_invsqrt_ps(rsq00);
282             rinv01           = gmx_mm_invsqrt_ps(rsq01);
283             rinv02           = gmx_mm_invsqrt_ps(rsq02);
284             rinv10           = gmx_mm_invsqrt_ps(rsq10);
285             rinv11           = gmx_mm_invsqrt_ps(rsq11);
286             rinv12           = gmx_mm_invsqrt_ps(rsq12);
287             rinv20           = gmx_mm_invsqrt_ps(rsq20);
288             rinv21           = gmx_mm_invsqrt_ps(rsq21);
289             rinv22           = gmx_mm_invsqrt_ps(rsq22);
290
291             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
292             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
293             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
294             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
295             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
296             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
297             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
298             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
299             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
300
301             fjx0             = _mm_setzero_ps();
302             fjy0             = _mm_setzero_ps();
303             fjz0             = _mm_setzero_ps();
304             fjx1             = _mm_setzero_ps();
305             fjy1             = _mm_setzero_ps();
306             fjz1             = _mm_setzero_ps();
307             fjx2             = _mm_setzero_ps();
308             fjy2             = _mm_setzero_ps();
309             fjz2             = _mm_setzero_ps();
310
311             /**************************
312              * CALCULATE INTERACTIONS *
313              **************************/
314
315             if (gmx_mm_any_lt(rsq00,rcutoff2))
316             {
317
318             r00              = _mm_mul_ps(rsq00,rinv00);
319
320             /* EWALD ELECTROSTATICS */
321
322             /* Analytical PME correction */
323             zeta2            = _mm_mul_ps(beta2,rsq00);
324             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
325             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
326             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
327             felec            = _mm_mul_ps(qq00,felec);
328             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
329             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
330             velec            = _mm_mul_ps(qq00,velec);
331
332             d                = _mm_sub_ps(r00,rswitch);
333             d                = _mm_max_ps(d,_mm_setzero_ps());
334             d2               = _mm_mul_ps(d,d);
335             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
336
337             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
338
339             /* Evaluate switch function */
340             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
341             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
342             velec            = _mm_mul_ps(velec,sw);
343             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
344
345             /* Update potential sum for this i atom from the interaction with this j atom. */
346             velec            = _mm_and_ps(velec,cutoff_mask);
347             velecsum         = _mm_add_ps(velecsum,velec);
348
349             fscal            = felec;
350
351             fscal            = _mm_and_ps(fscal,cutoff_mask);
352
353              /* Update vectorial force */
354             fix0             = _mm_macc_ps(dx00,fscal,fix0);
355             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
356             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
357
358             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
359             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
360             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
361
362             }
363
364             /**************************
365              * CALCULATE INTERACTIONS *
366              **************************/
367
368             if (gmx_mm_any_lt(rsq01,rcutoff2))
369             {
370
371             r01              = _mm_mul_ps(rsq01,rinv01);
372
373             /* EWALD ELECTROSTATICS */
374
375             /* Analytical PME correction */
376             zeta2            = _mm_mul_ps(beta2,rsq01);
377             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
378             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
379             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
380             felec            = _mm_mul_ps(qq01,felec);
381             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
382             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
383             velec            = _mm_mul_ps(qq01,velec);
384
385             d                = _mm_sub_ps(r01,rswitch);
386             d                = _mm_max_ps(d,_mm_setzero_ps());
387             d2               = _mm_mul_ps(d,d);
388             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
389
390             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
391
392             /* Evaluate switch function */
393             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
394             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
395             velec            = _mm_mul_ps(velec,sw);
396             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
397
398             /* Update potential sum for this i atom from the interaction with this j atom. */
399             velec            = _mm_and_ps(velec,cutoff_mask);
400             velecsum         = _mm_add_ps(velecsum,velec);
401
402             fscal            = felec;
403
404             fscal            = _mm_and_ps(fscal,cutoff_mask);
405
406              /* Update vectorial force */
407             fix0             = _mm_macc_ps(dx01,fscal,fix0);
408             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
409             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
410
411             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
412             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
413             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
414
415             }
416
417             /**************************
418              * CALCULATE INTERACTIONS *
419              **************************/
420
421             if (gmx_mm_any_lt(rsq02,rcutoff2))
422             {
423
424             r02              = _mm_mul_ps(rsq02,rinv02);
425
426             /* EWALD ELECTROSTATICS */
427
428             /* Analytical PME correction */
429             zeta2            = _mm_mul_ps(beta2,rsq02);
430             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
431             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
432             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
433             felec            = _mm_mul_ps(qq02,felec);
434             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
435             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
436             velec            = _mm_mul_ps(qq02,velec);
437
438             d                = _mm_sub_ps(r02,rswitch);
439             d                = _mm_max_ps(d,_mm_setzero_ps());
440             d2               = _mm_mul_ps(d,d);
441             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
442
443             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
444
445             /* Evaluate switch function */
446             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
447             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
448             velec            = _mm_mul_ps(velec,sw);
449             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
450
451             /* Update potential sum for this i atom from the interaction with this j atom. */
452             velec            = _mm_and_ps(velec,cutoff_mask);
453             velecsum         = _mm_add_ps(velecsum,velec);
454
455             fscal            = felec;
456
457             fscal            = _mm_and_ps(fscal,cutoff_mask);
458
459              /* Update vectorial force */
460             fix0             = _mm_macc_ps(dx02,fscal,fix0);
461             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
462             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
463
464             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
465             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
466             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
467
468             }
469
470             /**************************
471              * CALCULATE INTERACTIONS *
472              **************************/
473
474             if (gmx_mm_any_lt(rsq10,rcutoff2))
475             {
476
477             r10              = _mm_mul_ps(rsq10,rinv10);
478
479             /* EWALD ELECTROSTATICS */
480
481             /* Analytical PME correction */
482             zeta2            = _mm_mul_ps(beta2,rsq10);
483             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
484             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
485             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
486             felec            = _mm_mul_ps(qq10,felec);
487             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
488             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
489             velec            = _mm_mul_ps(qq10,velec);
490
491             d                = _mm_sub_ps(r10,rswitch);
492             d                = _mm_max_ps(d,_mm_setzero_ps());
493             d2               = _mm_mul_ps(d,d);
494             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
495
496             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
497
498             /* Evaluate switch function */
499             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
500             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
501             velec            = _mm_mul_ps(velec,sw);
502             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
503
504             /* Update potential sum for this i atom from the interaction with this j atom. */
505             velec            = _mm_and_ps(velec,cutoff_mask);
506             velecsum         = _mm_add_ps(velecsum,velec);
507
508             fscal            = felec;
509
510             fscal            = _mm_and_ps(fscal,cutoff_mask);
511
512              /* Update vectorial force */
513             fix1             = _mm_macc_ps(dx10,fscal,fix1);
514             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
515             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
516
517             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
518             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
519             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
520
521             }
522
523             /**************************
524              * CALCULATE INTERACTIONS *
525              **************************/
526
527             if (gmx_mm_any_lt(rsq11,rcutoff2))
528             {
529
530             r11              = _mm_mul_ps(rsq11,rinv11);
531
532             /* EWALD ELECTROSTATICS */
533
534             /* Analytical PME correction */
535             zeta2            = _mm_mul_ps(beta2,rsq11);
536             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
537             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
538             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
539             felec            = _mm_mul_ps(qq11,felec);
540             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
541             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
542             velec            = _mm_mul_ps(qq11,velec);
543
544             d                = _mm_sub_ps(r11,rswitch);
545             d                = _mm_max_ps(d,_mm_setzero_ps());
546             d2               = _mm_mul_ps(d,d);
547             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
548
549             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
550
551             /* Evaluate switch function */
552             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
553             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
554             velec            = _mm_mul_ps(velec,sw);
555             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
556
557             /* Update potential sum for this i atom from the interaction with this j atom. */
558             velec            = _mm_and_ps(velec,cutoff_mask);
559             velecsum         = _mm_add_ps(velecsum,velec);
560
561             fscal            = felec;
562
563             fscal            = _mm_and_ps(fscal,cutoff_mask);
564
565              /* Update vectorial force */
566             fix1             = _mm_macc_ps(dx11,fscal,fix1);
567             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
568             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
569
570             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
571             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
572             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
573
574             }
575
576             /**************************
577              * CALCULATE INTERACTIONS *
578              **************************/
579
580             if (gmx_mm_any_lt(rsq12,rcutoff2))
581             {
582
583             r12              = _mm_mul_ps(rsq12,rinv12);
584
585             /* EWALD ELECTROSTATICS */
586
587             /* Analytical PME correction */
588             zeta2            = _mm_mul_ps(beta2,rsq12);
589             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
590             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
591             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
592             felec            = _mm_mul_ps(qq12,felec);
593             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
594             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
595             velec            = _mm_mul_ps(qq12,velec);
596
597             d                = _mm_sub_ps(r12,rswitch);
598             d                = _mm_max_ps(d,_mm_setzero_ps());
599             d2               = _mm_mul_ps(d,d);
600             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
601
602             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
603
604             /* Evaluate switch function */
605             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
606             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
607             velec            = _mm_mul_ps(velec,sw);
608             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
609
610             /* Update potential sum for this i atom from the interaction with this j atom. */
611             velec            = _mm_and_ps(velec,cutoff_mask);
612             velecsum         = _mm_add_ps(velecsum,velec);
613
614             fscal            = felec;
615
616             fscal            = _mm_and_ps(fscal,cutoff_mask);
617
618              /* Update vectorial force */
619             fix1             = _mm_macc_ps(dx12,fscal,fix1);
620             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
621             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
622
623             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
624             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
625             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
626
627             }
628
629             /**************************
630              * CALCULATE INTERACTIONS *
631              **************************/
632
633             if (gmx_mm_any_lt(rsq20,rcutoff2))
634             {
635
636             r20              = _mm_mul_ps(rsq20,rinv20);
637
638             /* EWALD ELECTROSTATICS */
639
640             /* Analytical PME correction */
641             zeta2            = _mm_mul_ps(beta2,rsq20);
642             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
643             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
644             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
645             felec            = _mm_mul_ps(qq20,felec);
646             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
647             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
648             velec            = _mm_mul_ps(qq20,velec);
649
650             d                = _mm_sub_ps(r20,rswitch);
651             d                = _mm_max_ps(d,_mm_setzero_ps());
652             d2               = _mm_mul_ps(d,d);
653             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
654
655             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
656
657             /* Evaluate switch function */
658             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
659             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
660             velec            = _mm_mul_ps(velec,sw);
661             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
662
663             /* Update potential sum for this i atom from the interaction with this j atom. */
664             velec            = _mm_and_ps(velec,cutoff_mask);
665             velecsum         = _mm_add_ps(velecsum,velec);
666
667             fscal            = felec;
668
669             fscal            = _mm_and_ps(fscal,cutoff_mask);
670
671              /* Update vectorial force */
672             fix2             = _mm_macc_ps(dx20,fscal,fix2);
673             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
674             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
675
676             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
677             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
678             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
679
680             }
681
682             /**************************
683              * CALCULATE INTERACTIONS *
684              **************************/
685
686             if (gmx_mm_any_lt(rsq21,rcutoff2))
687             {
688
689             r21              = _mm_mul_ps(rsq21,rinv21);
690
691             /* EWALD ELECTROSTATICS */
692
693             /* Analytical PME correction */
694             zeta2            = _mm_mul_ps(beta2,rsq21);
695             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
696             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
697             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
698             felec            = _mm_mul_ps(qq21,felec);
699             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
700             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
701             velec            = _mm_mul_ps(qq21,velec);
702
703             d                = _mm_sub_ps(r21,rswitch);
704             d                = _mm_max_ps(d,_mm_setzero_ps());
705             d2               = _mm_mul_ps(d,d);
706             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
707
708             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
709
710             /* Evaluate switch function */
711             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
712             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
713             velec            = _mm_mul_ps(velec,sw);
714             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
715
716             /* Update potential sum for this i atom from the interaction with this j atom. */
717             velec            = _mm_and_ps(velec,cutoff_mask);
718             velecsum         = _mm_add_ps(velecsum,velec);
719
720             fscal            = felec;
721
722             fscal            = _mm_and_ps(fscal,cutoff_mask);
723
724              /* Update vectorial force */
725             fix2             = _mm_macc_ps(dx21,fscal,fix2);
726             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
727             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
728
729             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
730             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
731             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
732
733             }
734
735             /**************************
736              * CALCULATE INTERACTIONS *
737              **************************/
738
739             if (gmx_mm_any_lt(rsq22,rcutoff2))
740             {
741
742             r22              = _mm_mul_ps(rsq22,rinv22);
743
744             /* EWALD ELECTROSTATICS */
745
746             /* Analytical PME correction */
747             zeta2            = _mm_mul_ps(beta2,rsq22);
748             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
749             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
750             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
751             felec            = _mm_mul_ps(qq22,felec);
752             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
753             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
754             velec            = _mm_mul_ps(qq22,velec);
755
756             d                = _mm_sub_ps(r22,rswitch);
757             d                = _mm_max_ps(d,_mm_setzero_ps());
758             d2               = _mm_mul_ps(d,d);
759             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
760
761             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
762
763             /* Evaluate switch function */
764             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
765             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
766             velec            = _mm_mul_ps(velec,sw);
767             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
768
769             /* Update potential sum for this i atom from the interaction with this j atom. */
770             velec            = _mm_and_ps(velec,cutoff_mask);
771             velecsum         = _mm_add_ps(velecsum,velec);
772
773             fscal            = felec;
774
775             fscal            = _mm_and_ps(fscal,cutoff_mask);
776
777              /* Update vectorial force */
778             fix2             = _mm_macc_ps(dx22,fscal,fix2);
779             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
780             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
781
782             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
783             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
784             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
785
786             }
787
788             fjptrA             = f+j_coord_offsetA;
789             fjptrB             = f+j_coord_offsetB;
790             fjptrC             = f+j_coord_offsetC;
791             fjptrD             = f+j_coord_offsetD;
792
793             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
794                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
795
796             /* Inner loop uses 477 flops */
797         }
798
799         if(jidx<j_index_end)
800         {
801
802             /* Get j neighbor index, and coordinate index */
803             jnrlistA         = jjnr[jidx];
804             jnrlistB         = jjnr[jidx+1];
805             jnrlistC         = jjnr[jidx+2];
806             jnrlistD         = jjnr[jidx+3];
807             /* Sign of each element will be negative for non-real atoms.
808              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
809              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
810              */
811             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
812             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
813             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
814             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
815             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
816             j_coord_offsetA  = DIM*jnrA;
817             j_coord_offsetB  = DIM*jnrB;
818             j_coord_offsetC  = DIM*jnrC;
819             j_coord_offsetD  = DIM*jnrD;
820
821             /* load j atom coordinates */
822             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
823                                               x+j_coord_offsetC,x+j_coord_offsetD,
824                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
825
826             /* Calculate displacement vector */
827             dx00             = _mm_sub_ps(ix0,jx0);
828             dy00             = _mm_sub_ps(iy0,jy0);
829             dz00             = _mm_sub_ps(iz0,jz0);
830             dx01             = _mm_sub_ps(ix0,jx1);
831             dy01             = _mm_sub_ps(iy0,jy1);
832             dz01             = _mm_sub_ps(iz0,jz1);
833             dx02             = _mm_sub_ps(ix0,jx2);
834             dy02             = _mm_sub_ps(iy0,jy2);
835             dz02             = _mm_sub_ps(iz0,jz2);
836             dx10             = _mm_sub_ps(ix1,jx0);
837             dy10             = _mm_sub_ps(iy1,jy0);
838             dz10             = _mm_sub_ps(iz1,jz0);
839             dx11             = _mm_sub_ps(ix1,jx1);
840             dy11             = _mm_sub_ps(iy1,jy1);
841             dz11             = _mm_sub_ps(iz1,jz1);
842             dx12             = _mm_sub_ps(ix1,jx2);
843             dy12             = _mm_sub_ps(iy1,jy2);
844             dz12             = _mm_sub_ps(iz1,jz2);
845             dx20             = _mm_sub_ps(ix2,jx0);
846             dy20             = _mm_sub_ps(iy2,jy0);
847             dz20             = _mm_sub_ps(iz2,jz0);
848             dx21             = _mm_sub_ps(ix2,jx1);
849             dy21             = _mm_sub_ps(iy2,jy1);
850             dz21             = _mm_sub_ps(iz2,jz1);
851             dx22             = _mm_sub_ps(ix2,jx2);
852             dy22             = _mm_sub_ps(iy2,jy2);
853             dz22             = _mm_sub_ps(iz2,jz2);
854
855             /* Calculate squared distance and things based on it */
856             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
857             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
858             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
859             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
860             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
861             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
862             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
863             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
864             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
865
866             rinv00           = gmx_mm_invsqrt_ps(rsq00);
867             rinv01           = gmx_mm_invsqrt_ps(rsq01);
868             rinv02           = gmx_mm_invsqrt_ps(rsq02);
869             rinv10           = gmx_mm_invsqrt_ps(rsq10);
870             rinv11           = gmx_mm_invsqrt_ps(rsq11);
871             rinv12           = gmx_mm_invsqrt_ps(rsq12);
872             rinv20           = gmx_mm_invsqrt_ps(rsq20);
873             rinv21           = gmx_mm_invsqrt_ps(rsq21);
874             rinv22           = gmx_mm_invsqrt_ps(rsq22);
875
876             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
877             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
878             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
879             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
880             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
881             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
882             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
883             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
884             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
885
886             fjx0             = _mm_setzero_ps();
887             fjy0             = _mm_setzero_ps();
888             fjz0             = _mm_setzero_ps();
889             fjx1             = _mm_setzero_ps();
890             fjy1             = _mm_setzero_ps();
891             fjz1             = _mm_setzero_ps();
892             fjx2             = _mm_setzero_ps();
893             fjy2             = _mm_setzero_ps();
894             fjz2             = _mm_setzero_ps();
895
896             /**************************
897              * CALCULATE INTERACTIONS *
898              **************************/
899
900             if (gmx_mm_any_lt(rsq00,rcutoff2))
901             {
902
903             r00              = _mm_mul_ps(rsq00,rinv00);
904             r00              = _mm_andnot_ps(dummy_mask,r00);
905
906             /* EWALD ELECTROSTATICS */
907
908             /* Analytical PME correction */
909             zeta2            = _mm_mul_ps(beta2,rsq00);
910             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
911             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
912             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
913             felec            = _mm_mul_ps(qq00,felec);
914             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
915             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
916             velec            = _mm_mul_ps(qq00,velec);
917
918             d                = _mm_sub_ps(r00,rswitch);
919             d                = _mm_max_ps(d,_mm_setzero_ps());
920             d2               = _mm_mul_ps(d,d);
921             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
922
923             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
924
925             /* Evaluate switch function */
926             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
927             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
928             velec            = _mm_mul_ps(velec,sw);
929             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
930
931             /* Update potential sum for this i atom from the interaction with this j atom. */
932             velec            = _mm_and_ps(velec,cutoff_mask);
933             velec            = _mm_andnot_ps(dummy_mask,velec);
934             velecsum         = _mm_add_ps(velecsum,velec);
935
936             fscal            = felec;
937
938             fscal            = _mm_and_ps(fscal,cutoff_mask);
939
940             fscal            = _mm_andnot_ps(dummy_mask,fscal);
941
942              /* Update vectorial force */
943             fix0             = _mm_macc_ps(dx00,fscal,fix0);
944             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
945             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
946
947             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
948             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
949             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
950
951             }
952
953             /**************************
954              * CALCULATE INTERACTIONS *
955              **************************/
956
957             if (gmx_mm_any_lt(rsq01,rcutoff2))
958             {
959
960             r01              = _mm_mul_ps(rsq01,rinv01);
961             r01              = _mm_andnot_ps(dummy_mask,r01);
962
963             /* EWALD ELECTROSTATICS */
964
965             /* Analytical PME correction */
966             zeta2            = _mm_mul_ps(beta2,rsq01);
967             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
968             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
969             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
970             felec            = _mm_mul_ps(qq01,felec);
971             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
972             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
973             velec            = _mm_mul_ps(qq01,velec);
974
975             d                = _mm_sub_ps(r01,rswitch);
976             d                = _mm_max_ps(d,_mm_setzero_ps());
977             d2               = _mm_mul_ps(d,d);
978             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
979
980             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
981
982             /* Evaluate switch function */
983             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
984             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
985             velec            = _mm_mul_ps(velec,sw);
986             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
987
988             /* Update potential sum for this i atom from the interaction with this j atom. */
989             velec            = _mm_and_ps(velec,cutoff_mask);
990             velec            = _mm_andnot_ps(dummy_mask,velec);
991             velecsum         = _mm_add_ps(velecsum,velec);
992
993             fscal            = felec;
994
995             fscal            = _mm_and_ps(fscal,cutoff_mask);
996
997             fscal            = _mm_andnot_ps(dummy_mask,fscal);
998
999              /* Update vectorial force */
1000             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1001             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1002             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1003
1004             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1005             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1006             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1007
1008             }
1009
1010             /**************************
1011              * CALCULATE INTERACTIONS *
1012              **************************/
1013
1014             if (gmx_mm_any_lt(rsq02,rcutoff2))
1015             {
1016
1017             r02              = _mm_mul_ps(rsq02,rinv02);
1018             r02              = _mm_andnot_ps(dummy_mask,r02);
1019
1020             /* EWALD ELECTROSTATICS */
1021
1022             /* Analytical PME correction */
1023             zeta2            = _mm_mul_ps(beta2,rsq02);
1024             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1025             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1026             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1027             felec            = _mm_mul_ps(qq02,felec);
1028             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1029             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1030             velec            = _mm_mul_ps(qq02,velec);
1031
1032             d                = _mm_sub_ps(r02,rswitch);
1033             d                = _mm_max_ps(d,_mm_setzero_ps());
1034             d2               = _mm_mul_ps(d,d);
1035             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1036
1037             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1038
1039             /* Evaluate switch function */
1040             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1041             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1042             velec            = _mm_mul_ps(velec,sw);
1043             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1044
1045             /* Update potential sum for this i atom from the interaction with this j atom. */
1046             velec            = _mm_and_ps(velec,cutoff_mask);
1047             velec            = _mm_andnot_ps(dummy_mask,velec);
1048             velecsum         = _mm_add_ps(velecsum,velec);
1049
1050             fscal            = felec;
1051
1052             fscal            = _mm_and_ps(fscal,cutoff_mask);
1053
1054             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1055
1056              /* Update vectorial force */
1057             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1058             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1059             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1060
1061             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1062             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1063             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1064
1065             }
1066
1067             /**************************
1068              * CALCULATE INTERACTIONS *
1069              **************************/
1070
1071             if (gmx_mm_any_lt(rsq10,rcutoff2))
1072             {
1073
1074             r10              = _mm_mul_ps(rsq10,rinv10);
1075             r10              = _mm_andnot_ps(dummy_mask,r10);
1076
1077             /* EWALD ELECTROSTATICS */
1078
1079             /* Analytical PME correction */
1080             zeta2            = _mm_mul_ps(beta2,rsq10);
1081             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1082             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1083             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1084             felec            = _mm_mul_ps(qq10,felec);
1085             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1086             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1087             velec            = _mm_mul_ps(qq10,velec);
1088
1089             d                = _mm_sub_ps(r10,rswitch);
1090             d                = _mm_max_ps(d,_mm_setzero_ps());
1091             d2               = _mm_mul_ps(d,d);
1092             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1093
1094             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1095
1096             /* Evaluate switch function */
1097             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1098             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1099             velec            = _mm_mul_ps(velec,sw);
1100             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1101
1102             /* Update potential sum for this i atom from the interaction with this j atom. */
1103             velec            = _mm_and_ps(velec,cutoff_mask);
1104             velec            = _mm_andnot_ps(dummy_mask,velec);
1105             velecsum         = _mm_add_ps(velecsum,velec);
1106
1107             fscal            = felec;
1108
1109             fscal            = _mm_and_ps(fscal,cutoff_mask);
1110
1111             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1112
1113              /* Update vectorial force */
1114             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1115             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1116             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1117
1118             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1119             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1120             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1121
1122             }
1123
1124             /**************************
1125              * CALCULATE INTERACTIONS *
1126              **************************/
1127
1128             if (gmx_mm_any_lt(rsq11,rcutoff2))
1129             {
1130
1131             r11              = _mm_mul_ps(rsq11,rinv11);
1132             r11              = _mm_andnot_ps(dummy_mask,r11);
1133
1134             /* EWALD ELECTROSTATICS */
1135
1136             /* Analytical PME correction */
1137             zeta2            = _mm_mul_ps(beta2,rsq11);
1138             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1139             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1140             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1141             felec            = _mm_mul_ps(qq11,felec);
1142             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1143             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1144             velec            = _mm_mul_ps(qq11,velec);
1145
1146             d                = _mm_sub_ps(r11,rswitch);
1147             d                = _mm_max_ps(d,_mm_setzero_ps());
1148             d2               = _mm_mul_ps(d,d);
1149             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1150
1151             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1152
1153             /* Evaluate switch function */
1154             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1155             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1156             velec            = _mm_mul_ps(velec,sw);
1157             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1158
1159             /* Update potential sum for this i atom from the interaction with this j atom. */
1160             velec            = _mm_and_ps(velec,cutoff_mask);
1161             velec            = _mm_andnot_ps(dummy_mask,velec);
1162             velecsum         = _mm_add_ps(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _mm_and_ps(fscal,cutoff_mask);
1167
1168             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1169
1170              /* Update vectorial force */
1171             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1172             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1173             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1174
1175             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1176             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1177             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1178
1179             }
1180
1181             /**************************
1182              * CALCULATE INTERACTIONS *
1183              **************************/
1184
1185             if (gmx_mm_any_lt(rsq12,rcutoff2))
1186             {
1187
1188             r12              = _mm_mul_ps(rsq12,rinv12);
1189             r12              = _mm_andnot_ps(dummy_mask,r12);
1190
1191             /* EWALD ELECTROSTATICS */
1192
1193             /* Analytical PME correction */
1194             zeta2            = _mm_mul_ps(beta2,rsq12);
1195             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1196             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1197             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1198             felec            = _mm_mul_ps(qq12,felec);
1199             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1200             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1201             velec            = _mm_mul_ps(qq12,velec);
1202
1203             d                = _mm_sub_ps(r12,rswitch);
1204             d                = _mm_max_ps(d,_mm_setzero_ps());
1205             d2               = _mm_mul_ps(d,d);
1206             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1207
1208             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1209
1210             /* Evaluate switch function */
1211             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1212             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1213             velec            = _mm_mul_ps(velec,sw);
1214             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1215
1216             /* Update potential sum for this i atom from the interaction with this j atom. */
1217             velec            = _mm_and_ps(velec,cutoff_mask);
1218             velec            = _mm_andnot_ps(dummy_mask,velec);
1219             velecsum         = _mm_add_ps(velecsum,velec);
1220
1221             fscal            = felec;
1222
1223             fscal            = _mm_and_ps(fscal,cutoff_mask);
1224
1225             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1226
1227              /* Update vectorial force */
1228             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1229             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1230             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1231
1232             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1233             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1234             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1235
1236             }
1237
1238             /**************************
1239              * CALCULATE INTERACTIONS *
1240              **************************/
1241
1242             if (gmx_mm_any_lt(rsq20,rcutoff2))
1243             {
1244
1245             r20              = _mm_mul_ps(rsq20,rinv20);
1246             r20              = _mm_andnot_ps(dummy_mask,r20);
1247
1248             /* EWALD ELECTROSTATICS */
1249
1250             /* Analytical PME correction */
1251             zeta2            = _mm_mul_ps(beta2,rsq20);
1252             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
1253             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1254             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1255             felec            = _mm_mul_ps(qq20,felec);
1256             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1257             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
1258             velec            = _mm_mul_ps(qq20,velec);
1259
1260             d                = _mm_sub_ps(r20,rswitch);
1261             d                = _mm_max_ps(d,_mm_setzero_ps());
1262             d2               = _mm_mul_ps(d,d);
1263             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1264
1265             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1266
1267             /* Evaluate switch function */
1268             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1269             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1270             velec            = _mm_mul_ps(velec,sw);
1271             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1272
1273             /* Update potential sum for this i atom from the interaction with this j atom. */
1274             velec            = _mm_and_ps(velec,cutoff_mask);
1275             velec            = _mm_andnot_ps(dummy_mask,velec);
1276             velecsum         = _mm_add_ps(velecsum,velec);
1277
1278             fscal            = felec;
1279
1280             fscal            = _mm_and_ps(fscal,cutoff_mask);
1281
1282             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1283
1284              /* Update vectorial force */
1285             fix2             = _mm_macc_ps(dx20,fscal,fix2);
1286             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
1287             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
1288
1289             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
1290             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
1291             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
1292
1293             }
1294
1295             /**************************
1296              * CALCULATE INTERACTIONS *
1297              **************************/
1298
1299             if (gmx_mm_any_lt(rsq21,rcutoff2))
1300             {
1301
1302             r21              = _mm_mul_ps(rsq21,rinv21);
1303             r21              = _mm_andnot_ps(dummy_mask,r21);
1304
1305             /* EWALD ELECTROSTATICS */
1306
1307             /* Analytical PME correction */
1308             zeta2            = _mm_mul_ps(beta2,rsq21);
1309             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
1310             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1311             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1312             felec            = _mm_mul_ps(qq21,felec);
1313             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1314             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
1315             velec            = _mm_mul_ps(qq21,velec);
1316
1317             d                = _mm_sub_ps(r21,rswitch);
1318             d                = _mm_max_ps(d,_mm_setzero_ps());
1319             d2               = _mm_mul_ps(d,d);
1320             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1321
1322             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1323
1324             /* Evaluate switch function */
1325             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1326             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1327             velec            = _mm_mul_ps(velec,sw);
1328             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1329
1330             /* Update potential sum for this i atom from the interaction with this j atom. */
1331             velec            = _mm_and_ps(velec,cutoff_mask);
1332             velec            = _mm_andnot_ps(dummy_mask,velec);
1333             velecsum         = _mm_add_ps(velecsum,velec);
1334
1335             fscal            = felec;
1336
1337             fscal            = _mm_and_ps(fscal,cutoff_mask);
1338
1339             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1340
1341              /* Update vectorial force */
1342             fix2             = _mm_macc_ps(dx21,fscal,fix2);
1343             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
1344             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
1345
1346             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
1347             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
1348             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
1349
1350             }
1351
1352             /**************************
1353              * CALCULATE INTERACTIONS *
1354              **************************/
1355
1356             if (gmx_mm_any_lt(rsq22,rcutoff2))
1357             {
1358
1359             r22              = _mm_mul_ps(rsq22,rinv22);
1360             r22              = _mm_andnot_ps(dummy_mask,r22);
1361
1362             /* EWALD ELECTROSTATICS */
1363
1364             /* Analytical PME correction */
1365             zeta2            = _mm_mul_ps(beta2,rsq22);
1366             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
1367             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1368             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1369             felec            = _mm_mul_ps(qq22,felec);
1370             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1371             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
1372             velec            = _mm_mul_ps(qq22,velec);
1373
1374             d                = _mm_sub_ps(r22,rswitch);
1375             d                = _mm_max_ps(d,_mm_setzero_ps());
1376             d2               = _mm_mul_ps(d,d);
1377             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1378
1379             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1380
1381             /* Evaluate switch function */
1382             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1383             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1384             velec            = _mm_mul_ps(velec,sw);
1385             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1386
1387             /* Update potential sum for this i atom from the interaction with this j atom. */
1388             velec            = _mm_and_ps(velec,cutoff_mask);
1389             velec            = _mm_andnot_ps(dummy_mask,velec);
1390             velecsum         = _mm_add_ps(velecsum,velec);
1391
1392             fscal            = felec;
1393
1394             fscal            = _mm_and_ps(fscal,cutoff_mask);
1395
1396             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1397
1398              /* Update vectorial force */
1399             fix2             = _mm_macc_ps(dx22,fscal,fix2);
1400             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
1401             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
1402
1403             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
1404             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
1405             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
1406
1407             }
1408
1409             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1410             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1411             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1412             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1413
1414             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1415                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1416
1417             /* Inner loop uses 486 flops */
1418         }
1419
1420         /* End of innermost loop */
1421
1422         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1423                                               f+i_coord_offset,fshift+i_shift_offset);
1424
1425         ggid                        = gid[iidx];
1426         /* Update potential energies */
1427         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1428
1429         /* Increment number of inner iterations */
1430         inneriter                  += j_index_end - j_index_start;
1431
1432         /* Outer loop uses 19 flops */
1433     }
1434
1435     /* Increment number of outer iterations */
1436     outeriter        += nri;
1437
1438     /* Update outer/inner flops */
1439
1440     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*486);
1441 }
1442 /*
1443  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_single
1444  * Electrostatics interaction: Ewald
1445  * VdW interaction:            None
1446  * Geometry:                   Water3-Water3
1447  * Calculate force/pot:        Force
1448  */
1449 void
1450 nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_128_fma_single
1451                     (t_nblist                    * gmx_restrict       nlist,
1452                      rvec                        * gmx_restrict          xx,
1453                      rvec                        * gmx_restrict          ff,
1454                      t_forcerec                  * gmx_restrict          fr,
1455                      t_mdatoms                   * gmx_restrict     mdatoms,
1456                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1457                      t_nrnb                      * gmx_restrict        nrnb)
1458 {
1459     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1460      * just 0 for non-waters.
1461      * Suffixes A,B,C,D refer to j loop unrolling done with AVX_128, e.g. for the four different
1462      * jnr indices corresponding to data put in the four positions in the SIMD register.
1463      */
1464     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1465     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1466     int              jnrA,jnrB,jnrC,jnrD;
1467     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1468     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1469     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1470     real             rcutoff_scalar;
1471     real             *shiftvec,*fshift,*x,*f;
1472     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1473     real             scratch[4*DIM];
1474     __m128           fscal,rcutoff,rcutoff2,jidxall;
1475     int              vdwioffset0;
1476     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1477     int              vdwioffset1;
1478     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1479     int              vdwioffset2;
1480     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1481     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1482     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1483     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1484     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1485     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1486     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1487     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1488     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1489     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1490     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1491     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1492     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1493     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1494     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1495     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1496     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1497     real             *charge;
1498     __m128i          ewitab;
1499     __m128           ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1500     __m128           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1501     real             *ewtab;
1502     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1503     real             rswitch_scalar,d_scalar;
1504     __m128           dummy_mask,cutoff_mask;
1505     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1506     __m128           one     = _mm_set1_ps(1.0);
1507     __m128           two     = _mm_set1_ps(2.0);
1508     x                = xx[0];
1509     f                = ff[0];
1510
1511     nri              = nlist->nri;
1512     iinr             = nlist->iinr;
1513     jindex           = nlist->jindex;
1514     jjnr             = nlist->jjnr;
1515     shiftidx         = nlist->shift;
1516     gid              = nlist->gid;
1517     shiftvec         = fr->shift_vec[0];
1518     fshift           = fr->fshift[0];
1519     facel            = _mm_set1_ps(fr->epsfac);
1520     charge           = mdatoms->chargeA;
1521
1522     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1523     beta             = _mm_set1_ps(fr->ic->ewaldcoeff);
1524     beta2            = _mm_mul_ps(beta,beta);
1525     beta3            = _mm_mul_ps(beta,beta2);
1526     ewtab            = fr->ic->tabq_coul_FDV0;
1527     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1528     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1529
1530     /* Setup water-specific parameters */
1531     inr              = nlist->iinr[0];
1532     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1533     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1534     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1535
1536     jq0              = _mm_set1_ps(charge[inr+0]);
1537     jq1              = _mm_set1_ps(charge[inr+1]);
1538     jq2              = _mm_set1_ps(charge[inr+2]);
1539     qq00             = _mm_mul_ps(iq0,jq0);
1540     qq01             = _mm_mul_ps(iq0,jq1);
1541     qq02             = _mm_mul_ps(iq0,jq2);
1542     qq10             = _mm_mul_ps(iq1,jq0);
1543     qq11             = _mm_mul_ps(iq1,jq1);
1544     qq12             = _mm_mul_ps(iq1,jq2);
1545     qq20             = _mm_mul_ps(iq2,jq0);
1546     qq21             = _mm_mul_ps(iq2,jq1);
1547     qq22             = _mm_mul_ps(iq2,jq2);
1548
1549     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1550     rcutoff_scalar   = fr->rcoulomb;
1551     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1552     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1553
1554     rswitch_scalar   = fr->rcoulomb_switch;
1555     rswitch          = _mm_set1_ps(rswitch_scalar);
1556     /* Setup switch parameters */
1557     d_scalar         = rcutoff_scalar-rswitch_scalar;
1558     d                = _mm_set1_ps(d_scalar);
1559     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1560     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1561     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1562     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1563     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1564     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1565
1566     /* Avoid stupid compiler warnings */
1567     jnrA = jnrB = jnrC = jnrD = 0;
1568     j_coord_offsetA = 0;
1569     j_coord_offsetB = 0;
1570     j_coord_offsetC = 0;
1571     j_coord_offsetD = 0;
1572
1573     outeriter        = 0;
1574     inneriter        = 0;
1575
1576     for(iidx=0;iidx<4*DIM;iidx++)
1577     {
1578         scratch[iidx] = 0.0;
1579     }
1580
1581     /* Start outer loop over neighborlists */
1582     for(iidx=0; iidx<nri; iidx++)
1583     {
1584         /* Load shift vector for this list */
1585         i_shift_offset   = DIM*shiftidx[iidx];
1586
1587         /* Load limits for loop over neighbors */
1588         j_index_start    = jindex[iidx];
1589         j_index_end      = jindex[iidx+1];
1590
1591         /* Get outer coordinate index */
1592         inr              = iinr[iidx];
1593         i_coord_offset   = DIM*inr;
1594
1595         /* Load i particle coords and add shift vector */
1596         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1597                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1598
1599         fix0             = _mm_setzero_ps();
1600         fiy0             = _mm_setzero_ps();
1601         fiz0             = _mm_setzero_ps();
1602         fix1             = _mm_setzero_ps();
1603         fiy1             = _mm_setzero_ps();
1604         fiz1             = _mm_setzero_ps();
1605         fix2             = _mm_setzero_ps();
1606         fiy2             = _mm_setzero_ps();
1607         fiz2             = _mm_setzero_ps();
1608
1609         /* Start inner kernel loop */
1610         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1611         {
1612
1613             /* Get j neighbor index, and coordinate index */
1614             jnrA             = jjnr[jidx];
1615             jnrB             = jjnr[jidx+1];
1616             jnrC             = jjnr[jidx+2];
1617             jnrD             = jjnr[jidx+3];
1618             j_coord_offsetA  = DIM*jnrA;
1619             j_coord_offsetB  = DIM*jnrB;
1620             j_coord_offsetC  = DIM*jnrC;
1621             j_coord_offsetD  = DIM*jnrD;
1622
1623             /* load j atom coordinates */
1624             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1625                                               x+j_coord_offsetC,x+j_coord_offsetD,
1626                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1627
1628             /* Calculate displacement vector */
1629             dx00             = _mm_sub_ps(ix0,jx0);
1630             dy00             = _mm_sub_ps(iy0,jy0);
1631             dz00             = _mm_sub_ps(iz0,jz0);
1632             dx01             = _mm_sub_ps(ix0,jx1);
1633             dy01             = _mm_sub_ps(iy0,jy1);
1634             dz01             = _mm_sub_ps(iz0,jz1);
1635             dx02             = _mm_sub_ps(ix0,jx2);
1636             dy02             = _mm_sub_ps(iy0,jy2);
1637             dz02             = _mm_sub_ps(iz0,jz2);
1638             dx10             = _mm_sub_ps(ix1,jx0);
1639             dy10             = _mm_sub_ps(iy1,jy0);
1640             dz10             = _mm_sub_ps(iz1,jz0);
1641             dx11             = _mm_sub_ps(ix1,jx1);
1642             dy11             = _mm_sub_ps(iy1,jy1);
1643             dz11             = _mm_sub_ps(iz1,jz1);
1644             dx12             = _mm_sub_ps(ix1,jx2);
1645             dy12             = _mm_sub_ps(iy1,jy2);
1646             dz12             = _mm_sub_ps(iz1,jz2);
1647             dx20             = _mm_sub_ps(ix2,jx0);
1648             dy20             = _mm_sub_ps(iy2,jy0);
1649             dz20             = _mm_sub_ps(iz2,jz0);
1650             dx21             = _mm_sub_ps(ix2,jx1);
1651             dy21             = _mm_sub_ps(iy2,jy1);
1652             dz21             = _mm_sub_ps(iz2,jz1);
1653             dx22             = _mm_sub_ps(ix2,jx2);
1654             dy22             = _mm_sub_ps(iy2,jy2);
1655             dz22             = _mm_sub_ps(iz2,jz2);
1656
1657             /* Calculate squared distance and things based on it */
1658             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1659             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1660             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1661             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1662             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1663             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1664             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1665             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1666             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1667
1668             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1669             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1670             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1671             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1672             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1673             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1674             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1675             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1676             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1677
1678             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1679             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1680             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1681             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1682             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1683             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1684             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1685             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1686             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1687
1688             fjx0             = _mm_setzero_ps();
1689             fjy0             = _mm_setzero_ps();
1690             fjz0             = _mm_setzero_ps();
1691             fjx1             = _mm_setzero_ps();
1692             fjy1             = _mm_setzero_ps();
1693             fjz1             = _mm_setzero_ps();
1694             fjx2             = _mm_setzero_ps();
1695             fjy2             = _mm_setzero_ps();
1696             fjz2             = _mm_setzero_ps();
1697
1698             /**************************
1699              * CALCULATE INTERACTIONS *
1700              **************************/
1701
1702             if (gmx_mm_any_lt(rsq00,rcutoff2))
1703             {
1704
1705             r00              = _mm_mul_ps(rsq00,rinv00);
1706
1707             /* EWALD ELECTROSTATICS */
1708
1709             /* Analytical PME correction */
1710             zeta2            = _mm_mul_ps(beta2,rsq00);
1711             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
1712             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1713             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1714             felec            = _mm_mul_ps(qq00,felec);
1715             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1716             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
1717             velec            = _mm_mul_ps(qq00,velec);
1718
1719             d                = _mm_sub_ps(r00,rswitch);
1720             d                = _mm_max_ps(d,_mm_setzero_ps());
1721             d2               = _mm_mul_ps(d,d);
1722             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1723
1724             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1725
1726             /* Evaluate switch function */
1727             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1728             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1729             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1730
1731             fscal            = felec;
1732
1733             fscal            = _mm_and_ps(fscal,cutoff_mask);
1734
1735              /* Update vectorial force */
1736             fix0             = _mm_macc_ps(dx00,fscal,fix0);
1737             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
1738             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
1739
1740             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
1741             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
1742             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
1743
1744             }
1745
1746             /**************************
1747              * CALCULATE INTERACTIONS *
1748              **************************/
1749
1750             if (gmx_mm_any_lt(rsq01,rcutoff2))
1751             {
1752
1753             r01              = _mm_mul_ps(rsq01,rinv01);
1754
1755             /* EWALD ELECTROSTATICS */
1756
1757             /* Analytical PME correction */
1758             zeta2            = _mm_mul_ps(beta2,rsq01);
1759             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
1760             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1761             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1762             felec            = _mm_mul_ps(qq01,felec);
1763             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1764             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
1765             velec            = _mm_mul_ps(qq01,velec);
1766
1767             d                = _mm_sub_ps(r01,rswitch);
1768             d                = _mm_max_ps(d,_mm_setzero_ps());
1769             d2               = _mm_mul_ps(d,d);
1770             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1771
1772             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1773
1774             /* Evaluate switch function */
1775             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1776             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1777             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1778
1779             fscal            = felec;
1780
1781             fscal            = _mm_and_ps(fscal,cutoff_mask);
1782
1783              /* Update vectorial force */
1784             fix0             = _mm_macc_ps(dx01,fscal,fix0);
1785             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
1786             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
1787
1788             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
1789             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
1790             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
1791
1792             }
1793
1794             /**************************
1795              * CALCULATE INTERACTIONS *
1796              **************************/
1797
1798             if (gmx_mm_any_lt(rsq02,rcutoff2))
1799             {
1800
1801             r02              = _mm_mul_ps(rsq02,rinv02);
1802
1803             /* EWALD ELECTROSTATICS */
1804
1805             /* Analytical PME correction */
1806             zeta2            = _mm_mul_ps(beta2,rsq02);
1807             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
1808             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1809             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1810             felec            = _mm_mul_ps(qq02,felec);
1811             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1812             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
1813             velec            = _mm_mul_ps(qq02,velec);
1814
1815             d                = _mm_sub_ps(r02,rswitch);
1816             d                = _mm_max_ps(d,_mm_setzero_ps());
1817             d2               = _mm_mul_ps(d,d);
1818             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1819
1820             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1821
1822             /* Evaluate switch function */
1823             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1824             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1825             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1826
1827             fscal            = felec;
1828
1829             fscal            = _mm_and_ps(fscal,cutoff_mask);
1830
1831              /* Update vectorial force */
1832             fix0             = _mm_macc_ps(dx02,fscal,fix0);
1833             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
1834             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
1835
1836             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
1837             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
1838             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
1839
1840             }
1841
1842             /**************************
1843              * CALCULATE INTERACTIONS *
1844              **************************/
1845
1846             if (gmx_mm_any_lt(rsq10,rcutoff2))
1847             {
1848
1849             r10              = _mm_mul_ps(rsq10,rinv10);
1850
1851             /* EWALD ELECTROSTATICS */
1852
1853             /* Analytical PME correction */
1854             zeta2            = _mm_mul_ps(beta2,rsq10);
1855             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
1856             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1857             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1858             felec            = _mm_mul_ps(qq10,felec);
1859             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1860             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
1861             velec            = _mm_mul_ps(qq10,velec);
1862
1863             d                = _mm_sub_ps(r10,rswitch);
1864             d                = _mm_max_ps(d,_mm_setzero_ps());
1865             d2               = _mm_mul_ps(d,d);
1866             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1867
1868             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1869
1870             /* Evaluate switch function */
1871             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1872             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1873             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1874
1875             fscal            = felec;
1876
1877             fscal            = _mm_and_ps(fscal,cutoff_mask);
1878
1879              /* Update vectorial force */
1880             fix1             = _mm_macc_ps(dx10,fscal,fix1);
1881             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
1882             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
1883
1884             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
1885             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
1886             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
1887
1888             }
1889
1890             /**************************
1891              * CALCULATE INTERACTIONS *
1892              **************************/
1893
1894             if (gmx_mm_any_lt(rsq11,rcutoff2))
1895             {
1896
1897             r11              = _mm_mul_ps(rsq11,rinv11);
1898
1899             /* EWALD ELECTROSTATICS */
1900
1901             /* Analytical PME correction */
1902             zeta2            = _mm_mul_ps(beta2,rsq11);
1903             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
1904             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1905             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1906             felec            = _mm_mul_ps(qq11,felec);
1907             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1908             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
1909             velec            = _mm_mul_ps(qq11,velec);
1910
1911             d                = _mm_sub_ps(r11,rswitch);
1912             d                = _mm_max_ps(d,_mm_setzero_ps());
1913             d2               = _mm_mul_ps(d,d);
1914             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1915
1916             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1917
1918             /* Evaluate switch function */
1919             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1920             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1921             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1922
1923             fscal            = felec;
1924
1925             fscal            = _mm_and_ps(fscal,cutoff_mask);
1926
1927              /* Update vectorial force */
1928             fix1             = _mm_macc_ps(dx11,fscal,fix1);
1929             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
1930             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
1931
1932             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
1933             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
1934             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
1935
1936             }
1937
1938             /**************************
1939              * CALCULATE INTERACTIONS *
1940              **************************/
1941
1942             if (gmx_mm_any_lt(rsq12,rcutoff2))
1943             {
1944
1945             r12              = _mm_mul_ps(rsq12,rinv12);
1946
1947             /* EWALD ELECTROSTATICS */
1948
1949             /* Analytical PME correction */
1950             zeta2            = _mm_mul_ps(beta2,rsq12);
1951             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
1952             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
1953             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
1954             felec            = _mm_mul_ps(qq12,felec);
1955             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
1956             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
1957             velec            = _mm_mul_ps(qq12,velec);
1958
1959             d                = _mm_sub_ps(r12,rswitch);
1960             d                = _mm_max_ps(d,_mm_setzero_ps());
1961             d2               = _mm_mul_ps(d,d);
1962             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
1963
1964             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
1965
1966             /* Evaluate switch function */
1967             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1968             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1969             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1970
1971             fscal            = felec;
1972
1973             fscal            = _mm_and_ps(fscal,cutoff_mask);
1974
1975              /* Update vectorial force */
1976             fix1             = _mm_macc_ps(dx12,fscal,fix1);
1977             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
1978             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
1979
1980             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
1981             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
1982             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
1983
1984             }
1985
1986             /**************************
1987              * CALCULATE INTERACTIONS *
1988              **************************/
1989
1990             if (gmx_mm_any_lt(rsq20,rcutoff2))
1991             {
1992
1993             r20              = _mm_mul_ps(rsq20,rinv20);
1994
1995             /* EWALD ELECTROSTATICS */
1996
1997             /* Analytical PME correction */
1998             zeta2            = _mm_mul_ps(beta2,rsq20);
1999             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
2000             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2001             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2002             felec            = _mm_mul_ps(qq20,felec);
2003             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2004             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2005             velec            = _mm_mul_ps(qq20,velec);
2006
2007             d                = _mm_sub_ps(r20,rswitch);
2008             d                = _mm_max_ps(d,_mm_setzero_ps());
2009             d2               = _mm_mul_ps(d,d);
2010             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2011
2012             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2013
2014             /* Evaluate switch function */
2015             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2016             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2017             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2018
2019             fscal            = felec;
2020
2021             fscal            = _mm_and_ps(fscal,cutoff_mask);
2022
2023              /* Update vectorial force */
2024             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2025             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2026             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2027
2028             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2029             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2030             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2031
2032             }
2033
2034             /**************************
2035              * CALCULATE INTERACTIONS *
2036              **************************/
2037
2038             if (gmx_mm_any_lt(rsq21,rcutoff2))
2039             {
2040
2041             r21              = _mm_mul_ps(rsq21,rinv21);
2042
2043             /* EWALD ELECTROSTATICS */
2044
2045             /* Analytical PME correction */
2046             zeta2            = _mm_mul_ps(beta2,rsq21);
2047             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2048             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2049             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2050             felec            = _mm_mul_ps(qq21,felec);
2051             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2052             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2053             velec            = _mm_mul_ps(qq21,velec);
2054
2055             d                = _mm_sub_ps(r21,rswitch);
2056             d                = _mm_max_ps(d,_mm_setzero_ps());
2057             d2               = _mm_mul_ps(d,d);
2058             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2059
2060             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2061
2062             /* Evaluate switch function */
2063             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2064             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2065             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2066
2067             fscal            = felec;
2068
2069             fscal            = _mm_and_ps(fscal,cutoff_mask);
2070
2071              /* Update vectorial force */
2072             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2073             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2074             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2075
2076             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2077             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2078             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2079
2080             }
2081
2082             /**************************
2083              * CALCULATE INTERACTIONS *
2084              **************************/
2085
2086             if (gmx_mm_any_lt(rsq22,rcutoff2))
2087             {
2088
2089             r22              = _mm_mul_ps(rsq22,rinv22);
2090
2091             /* EWALD ELECTROSTATICS */
2092
2093             /* Analytical PME correction */
2094             zeta2            = _mm_mul_ps(beta2,rsq22);
2095             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2096             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2097             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2098             felec            = _mm_mul_ps(qq22,felec);
2099             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2100             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2101             velec            = _mm_mul_ps(qq22,velec);
2102
2103             d                = _mm_sub_ps(r22,rswitch);
2104             d                = _mm_max_ps(d,_mm_setzero_ps());
2105             d2               = _mm_mul_ps(d,d);
2106             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2107
2108             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2109
2110             /* Evaluate switch function */
2111             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2112             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2113             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2114
2115             fscal            = felec;
2116
2117             fscal            = _mm_and_ps(fscal,cutoff_mask);
2118
2119              /* Update vectorial force */
2120             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2121             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2122             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2123
2124             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2125             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2126             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2127
2128             }
2129
2130             fjptrA             = f+j_coord_offsetA;
2131             fjptrB             = f+j_coord_offsetB;
2132             fjptrC             = f+j_coord_offsetC;
2133             fjptrD             = f+j_coord_offsetD;
2134
2135             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2136                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2137
2138             /* Inner loop uses 450 flops */
2139         }
2140
2141         if(jidx<j_index_end)
2142         {
2143
2144             /* Get j neighbor index, and coordinate index */
2145             jnrlistA         = jjnr[jidx];
2146             jnrlistB         = jjnr[jidx+1];
2147             jnrlistC         = jjnr[jidx+2];
2148             jnrlistD         = jjnr[jidx+3];
2149             /* Sign of each element will be negative for non-real atoms.
2150              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2151              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2152              */
2153             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2154             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2155             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2156             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2157             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2158             j_coord_offsetA  = DIM*jnrA;
2159             j_coord_offsetB  = DIM*jnrB;
2160             j_coord_offsetC  = DIM*jnrC;
2161             j_coord_offsetD  = DIM*jnrD;
2162
2163             /* load j atom coordinates */
2164             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2165                                               x+j_coord_offsetC,x+j_coord_offsetD,
2166                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2167
2168             /* Calculate displacement vector */
2169             dx00             = _mm_sub_ps(ix0,jx0);
2170             dy00             = _mm_sub_ps(iy0,jy0);
2171             dz00             = _mm_sub_ps(iz0,jz0);
2172             dx01             = _mm_sub_ps(ix0,jx1);
2173             dy01             = _mm_sub_ps(iy0,jy1);
2174             dz01             = _mm_sub_ps(iz0,jz1);
2175             dx02             = _mm_sub_ps(ix0,jx2);
2176             dy02             = _mm_sub_ps(iy0,jy2);
2177             dz02             = _mm_sub_ps(iz0,jz2);
2178             dx10             = _mm_sub_ps(ix1,jx0);
2179             dy10             = _mm_sub_ps(iy1,jy0);
2180             dz10             = _mm_sub_ps(iz1,jz0);
2181             dx11             = _mm_sub_ps(ix1,jx1);
2182             dy11             = _mm_sub_ps(iy1,jy1);
2183             dz11             = _mm_sub_ps(iz1,jz1);
2184             dx12             = _mm_sub_ps(ix1,jx2);
2185             dy12             = _mm_sub_ps(iy1,jy2);
2186             dz12             = _mm_sub_ps(iz1,jz2);
2187             dx20             = _mm_sub_ps(ix2,jx0);
2188             dy20             = _mm_sub_ps(iy2,jy0);
2189             dz20             = _mm_sub_ps(iz2,jz0);
2190             dx21             = _mm_sub_ps(ix2,jx1);
2191             dy21             = _mm_sub_ps(iy2,jy1);
2192             dz21             = _mm_sub_ps(iz2,jz1);
2193             dx22             = _mm_sub_ps(ix2,jx2);
2194             dy22             = _mm_sub_ps(iy2,jy2);
2195             dz22             = _mm_sub_ps(iz2,jz2);
2196
2197             /* Calculate squared distance and things based on it */
2198             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2199             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2200             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2201             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2202             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2203             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2204             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2205             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2206             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2207
2208             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2209             rinv01           = gmx_mm_invsqrt_ps(rsq01);
2210             rinv02           = gmx_mm_invsqrt_ps(rsq02);
2211             rinv10           = gmx_mm_invsqrt_ps(rsq10);
2212             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2213             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2214             rinv20           = gmx_mm_invsqrt_ps(rsq20);
2215             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2216             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2217
2218             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2219             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2220             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2221             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2222             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2223             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2224             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2225             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2226             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2227
2228             fjx0             = _mm_setzero_ps();
2229             fjy0             = _mm_setzero_ps();
2230             fjz0             = _mm_setzero_ps();
2231             fjx1             = _mm_setzero_ps();
2232             fjy1             = _mm_setzero_ps();
2233             fjz1             = _mm_setzero_ps();
2234             fjx2             = _mm_setzero_ps();
2235             fjy2             = _mm_setzero_ps();
2236             fjz2             = _mm_setzero_ps();
2237
2238             /**************************
2239              * CALCULATE INTERACTIONS *
2240              **************************/
2241
2242             if (gmx_mm_any_lt(rsq00,rcutoff2))
2243             {
2244
2245             r00              = _mm_mul_ps(rsq00,rinv00);
2246             r00              = _mm_andnot_ps(dummy_mask,r00);
2247
2248             /* EWALD ELECTROSTATICS */
2249
2250             /* Analytical PME correction */
2251             zeta2            = _mm_mul_ps(beta2,rsq00);
2252             rinv3            = _mm_mul_ps(rinvsq00,rinv00);
2253             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2254             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2255             felec            = _mm_mul_ps(qq00,felec);
2256             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2257             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv00);
2258             velec            = _mm_mul_ps(qq00,velec);
2259
2260             d                = _mm_sub_ps(r00,rswitch);
2261             d                = _mm_max_ps(d,_mm_setzero_ps());
2262             d2               = _mm_mul_ps(d,d);
2263             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2264
2265             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2266
2267             /* Evaluate switch function */
2268             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2269             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2270             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2271
2272             fscal            = felec;
2273
2274             fscal            = _mm_and_ps(fscal,cutoff_mask);
2275
2276             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2277
2278              /* Update vectorial force */
2279             fix0             = _mm_macc_ps(dx00,fscal,fix0);
2280             fiy0             = _mm_macc_ps(dy00,fscal,fiy0);
2281             fiz0             = _mm_macc_ps(dz00,fscal,fiz0);
2282
2283             fjx0             = _mm_macc_ps(dx00,fscal,fjx0);
2284             fjy0             = _mm_macc_ps(dy00,fscal,fjy0);
2285             fjz0             = _mm_macc_ps(dz00,fscal,fjz0);
2286
2287             }
2288
2289             /**************************
2290              * CALCULATE INTERACTIONS *
2291              **************************/
2292
2293             if (gmx_mm_any_lt(rsq01,rcutoff2))
2294             {
2295
2296             r01              = _mm_mul_ps(rsq01,rinv01);
2297             r01              = _mm_andnot_ps(dummy_mask,r01);
2298
2299             /* EWALD ELECTROSTATICS */
2300
2301             /* Analytical PME correction */
2302             zeta2            = _mm_mul_ps(beta2,rsq01);
2303             rinv3            = _mm_mul_ps(rinvsq01,rinv01);
2304             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2305             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2306             felec            = _mm_mul_ps(qq01,felec);
2307             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2308             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv01);
2309             velec            = _mm_mul_ps(qq01,velec);
2310
2311             d                = _mm_sub_ps(r01,rswitch);
2312             d                = _mm_max_ps(d,_mm_setzero_ps());
2313             d2               = _mm_mul_ps(d,d);
2314             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2315
2316             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2317
2318             /* Evaluate switch function */
2319             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2320             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2321             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2322
2323             fscal            = felec;
2324
2325             fscal            = _mm_and_ps(fscal,cutoff_mask);
2326
2327             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2328
2329              /* Update vectorial force */
2330             fix0             = _mm_macc_ps(dx01,fscal,fix0);
2331             fiy0             = _mm_macc_ps(dy01,fscal,fiy0);
2332             fiz0             = _mm_macc_ps(dz01,fscal,fiz0);
2333
2334             fjx1             = _mm_macc_ps(dx01,fscal,fjx1);
2335             fjy1             = _mm_macc_ps(dy01,fscal,fjy1);
2336             fjz1             = _mm_macc_ps(dz01,fscal,fjz1);
2337
2338             }
2339
2340             /**************************
2341              * CALCULATE INTERACTIONS *
2342              **************************/
2343
2344             if (gmx_mm_any_lt(rsq02,rcutoff2))
2345             {
2346
2347             r02              = _mm_mul_ps(rsq02,rinv02);
2348             r02              = _mm_andnot_ps(dummy_mask,r02);
2349
2350             /* EWALD ELECTROSTATICS */
2351
2352             /* Analytical PME correction */
2353             zeta2            = _mm_mul_ps(beta2,rsq02);
2354             rinv3            = _mm_mul_ps(rinvsq02,rinv02);
2355             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2356             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2357             felec            = _mm_mul_ps(qq02,felec);
2358             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2359             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv02);
2360             velec            = _mm_mul_ps(qq02,velec);
2361
2362             d                = _mm_sub_ps(r02,rswitch);
2363             d                = _mm_max_ps(d,_mm_setzero_ps());
2364             d2               = _mm_mul_ps(d,d);
2365             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2366
2367             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2368
2369             /* Evaluate switch function */
2370             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2371             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2372             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2373
2374             fscal            = felec;
2375
2376             fscal            = _mm_and_ps(fscal,cutoff_mask);
2377
2378             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2379
2380              /* Update vectorial force */
2381             fix0             = _mm_macc_ps(dx02,fscal,fix0);
2382             fiy0             = _mm_macc_ps(dy02,fscal,fiy0);
2383             fiz0             = _mm_macc_ps(dz02,fscal,fiz0);
2384
2385             fjx2             = _mm_macc_ps(dx02,fscal,fjx2);
2386             fjy2             = _mm_macc_ps(dy02,fscal,fjy2);
2387             fjz2             = _mm_macc_ps(dz02,fscal,fjz2);
2388
2389             }
2390
2391             /**************************
2392              * CALCULATE INTERACTIONS *
2393              **************************/
2394
2395             if (gmx_mm_any_lt(rsq10,rcutoff2))
2396             {
2397
2398             r10              = _mm_mul_ps(rsq10,rinv10);
2399             r10              = _mm_andnot_ps(dummy_mask,r10);
2400
2401             /* EWALD ELECTROSTATICS */
2402
2403             /* Analytical PME correction */
2404             zeta2            = _mm_mul_ps(beta2,rsq10);
2405             rinv3            = _mm_mul_ps(rinvsq10,rinv10);
2406             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2407             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2408             felec            = _mm_mul_ps(qq10,felec);
2409             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2410             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv10);
2411             velec            = _mm_mul_ps(qq10,velec);
2412
2413             d                = _mm_sub_ps(r10,rswitch);
2414             d                = _mm_max_ps(d,_mm_setzero_ps());
2415             d2               = _mm_mul_ps(d,d);
2416             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2417
2418             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2419
2420             /* Evaluate switch function */
2421             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2422             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2423             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2424
2425             fscal            = felec;
2426
2427             fscal            = _mm_and_ps(fscal,cutoff_mask);
2428
2429             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2430
2431              /* Update vectorial force */
2432             fix1             = _mm_macc_ps(dx10,fscal,fix1);
2433             fiy1             = _mm_macc_ps(dy10,fscal,fiy1);
2434             fiz1             = _mm_macc_ps(dz10,fscal,fiz1);
2435
2436             fjx0             = _mm_macc_ps(dx10,fscal,fjx0);
2437             fjy0             = _mm_macc_ps(dy10,fscal,fjy0);
2438             fjz0             = _mm_macc_ps(dz10,fscal,fjz0);
2439
2440             }
2441
2442             /**************************
2443              * CALCULATE INTERACTIONS *
2444              **************************/
2445
2446             if (gmx_mm_any_lt(rsq11,rcutoff2))
2447             {
2448
2449             r11              = _mm_mul_ps(rsq11,rinv11);
2450             r11              = _mm_andnot_ps(dummy_mask,r11);
2451
2452             /* EWALD ELECTROSTATICS */
2453
2454             /* Analytical PME correction */
2455             zeta2            = _mm_mul_ps(beta2,rsq11);
2456             rinv3            = _mm_mul_ps(rinvsq11,rinv11);
2457             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2458             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2459             felec            = _mm_mul_ps(qq11,felec);
2460             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2461             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv11);
2462             velec            = _mm_mul_ps(qq11,velec);
2463
2464             d                = _mm_sub_ps(r11,rswitch);
2465             d                = _mm_max_ps(d,_mm_setzero_ps());
2466             d2               = _mm_mul_ps(d,d);
2467             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2468
2469             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2470
2471             /* Evaluate switch function */
2472             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2473             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2474             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2475
2476             fscal            = felec;
2477
2478             fscal            = _mm_and_ps(fscal,cutoff_mask);
2479
2480             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2481
2482              /* Update vectorial force */
2483             fix1             = _mm_macc_ps(dx11,fscal,fix1);
2484             fiy1             = _mm_macc_ps(dy11,fscal,fiy1);
2485             fiz1             = _mm_macc_ps(dz11,fscal,fiz1);
2486
2487             fjx1             = _mm_macc_ps(dx11,fscal,fjx1);
2488             fjy1             = _mm_macc_ps(dy11,fscal,fjy1);
2489             fjz1             = _mm_macc_ps(dz11,fscal,fjz1);
2490
2491             }
2492
2493             /**************************
2494              * CALCULATE INTERACTIONS *
2495              **************************/
2496
2497             if (gmx_mm_any_lt(rsq12,rcutoff2))
2498             {
2499
2500             r12              = _mm_mul_ps(rsq12,rinv12);
2501             r12              = _mm_andnot_ps(dummy_mask,r12);
2502
2503             /* EWALD ELECTROSTATICS */
2504
2505             /* Analytical PME correction */
2506             zeta2            = _mm_mul_ps(beta2,rsq12);
2507             rinv3            = _mm_mul_ps(rinvsq12,rinv12);
2508             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2509             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2510             felec            = _mm_mul_ps(qq12,felec);
2511             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2512             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv12);
2513             velec            = _mm_mul_ps(qq12,velec);
2514
2515             d                = _mm_sub_ps(r12,rswitch);
2516             d                = _mm_max_ps(d,_mm_setzero_ps());
2517             d2               = _mm_mul_ps(d,d);
2518             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2519
2520             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2521
2522             /* Evaluate switch function */
2523             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2524             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2525             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2526
2527             fscal            = felec;
2528
2529             fscal            = _mm_and_ps(fscal,cutoff_mask);
2530
2531             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2532
2533              /* Update vectorial force */
2534             fix1             = _mm_macc_ps(dx12,fscal,fix1);
2535             fiy1             = _mm_macc_ps(dy12,fscal,fiy1);
2536             fiz1             = _mm_macc_ps(dz12,fscal,fiz1);
2537
2538             fjx2             = _mm_macc_ps(dx12,fscal,fjx2);
2539             fjy2             = _mm_macc_ps(dy12,fscal,fjy2);
2540             fjz2             = _mm_macc_ps(dz12,fscal,fjz2);
2541
2542             }
2543
2544             /**************************
2545              * CALCULATE INTERACTIONS *
2546              **************************/
2547
2548             if (gmx_mm_any_lt(rsq20,rcutoff2))
2549             {
2550
2551             r20              = _mm_mul_ps(rsq20,rinv20);
2552             r20              = _mm_andnot_ps(dummy_mask,r20);
2553
2554             /* EWALD ELECTROSTATICS */
2555
2556             /* Analytical PME correction */
2557             zeta2            = _mm_mul_ps(beta2,rsq20);
2558             rinv3            = _mm_mul_ps(rinvsq20,rinv20);
2559             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2560             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2561             felec            = _mm_mul_ps(qq20,felec);
2562             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2563             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv20);
2564             velec            = _mm_mul_ps(qq20,velec);
2565
2566             d                = _mm_sub_ps(r20,rswitch);
2567             d                = _mm_max_ps(d,_mm_setzero_ps());
2568             d2               = _mm_mul_ps(d,d);
2569             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2570
2571             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2572
2573             /* Evaluate switch function */
2574             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2575             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2576             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2577
2578             fscal            = felec;
2579
2580             fscal            = _mm_and_ps(fscal,cutoff_mask);
2581
2582             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2583
2584              /* Update vectorial force */
2585             fix2             = _mm_macc_ps(dx20,fscal,fix2);
2586             fiy2             = _mm_macc_ps(dy20,fscal,fiy2);
2587             fiz2             = _mm_macc_ps(dz20,fscal,fiz2);
2588
2589             fjx0             = _mm_macc_ps(dx20,fscal,fjx0);
2590             fjy0             = _mm_macc_ps(dy20,fscal,fjy0);
2591             fjz0             = _mm_macc_ps(dz20,fscal,fjz0);
2592
2593             }
2594
2595             /**************************
2596              * CALCULATE INTERACTIONS *
2597              **************************/
2598
2599             if (gmx_mm_any_lt(rsq21,rcutoff2))
2600             {
2601
2602             r21              = _mm_mul_ps(rsq21,rinv21);
2603             r21              = _mm_andnot_ps(dummy_mask,r21);
2604
2605             /* EWALD ELECTROSTATICS */
2606
2607             /* Analytical PME correction */
2608             zeta2            = _mm_mul_ps(beta2,rsq21);
2609             rinv3            = _mm_mul_ps(rinvsq21,rinv21);
2610             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2611             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2612             felec            = _mm_mul_ps(qq21,felec);
2613             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2614             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv21);
2615             velec            = _mm_mul_ps(qq21,velec);
2616
2617             d                = _mm_sub_ps(r21,rswitch);
2618             d                = _mm_max_ps(d,_mm_setzero_ps());
2619             d2               = _mm_mul_ps(d,d);
2620             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2621
2622             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2623
2624             /* Evaluate switch function */
2625             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2626             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2627             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2628
2629             fscal            = felec;
2630
2631             fscal            = _mm_and_ps(fscal,cutoff_mask);
2632
2633             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2634
2635              /* Update vectorial force */
2636             fix2             = _mm_macc_ps(dx21,fscal,fix2);
2637             fiy2             = _mm_macc_ps(dy21,fscal,fiy2);
2638             fiz2             = _mm_macc_ps(dz21,fscal,fiz2);
2639
2640             fjx1             = _mm_macc_ps(dx21,fscal,fjx1);
2641             fjy1             = _mm_macc_ps(dy21,fscal,fjy1);
2642             fjz1             = _mm_macc_ps(dz21,fscal,fjz1);
2643
2644             }
2645
2646             /**************************
2647              * CALCULATE INTERACTIONS *
2648              **************************/
2649
2650             if (gmx_mm_any_lt(rsq22,rcutoff2))
2651             {
2652
2653             r22              = _mm_mul_ps(rsq22,rinv22);
2654             r22              = _mm_andnot_ps(dummy_mask,r22);
2655
2656             /* EWALD ELECTROSTATICS */
2657
2658             /* Analytical PME correction */
2659             zeta2            = _mm_mul_ps(beta2,rsq22);
2660             rinv3            = _mm_mul_ps(rinvsq22,rinv22);
2661             pmecorrF         = gmx_mm_pmecorrF_ps(zeta2);
2662             felec            = _mm_macc_ps(pmecorrF,beta3,rinv3);
2663             felec            = _mm_mul_ps(qq22,felec);
2664             pmecorrV         = gmx_mm_pmecorrV_ps(zeta2);
2665             velec            = _mm_nmacc_ps(pmecorrV,beta,rinv22);
2666             velec            = _mm_mul_ps(qq22,velec);
2667
2668             d                = _mm_sub_ps(r22,rswitch);
2669             d                = _mm_max_ps(d,_mm_setzero_ps());
2670             d2               = _mm_mul_ps(d,d);
2671             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_macc_ps(d,_mm_macc_ps(d,swV5,swV4),swV3))));
2672
2673             dsw              = _mm_mul_ps(d2,_mm_macc_ps(d,_mm_macc_ps(d,swF4,swF3),swF2));
2674
2675             /* Evaluate switch function */
2676             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2677             felec            = _mm_msub_ps( felec,sw , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2678             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2679
2680             fscal            = felec;
2681
2682             fscal            = _mm_and_ps(fscal,cutoff_mask);
2683
2684             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2685
2686              /* Update vectorial force */
2687             fix2             = _mm_macc_ps(dx22,fscal,fix2);
2688             fiy2             = _mm_macc_ps(dy22,fscal,fiy2);
2689             fiz2             = _mm_macc_ps(dz22,fscal,fiz2);
2690
2691             fjx2             = _mm_macc_ps(dx22,fscal,fjx2);
2692             fjy2             = _mm_macc_ps(dy22,fscal,fjy2);
2693             fjz2             = _mm_macc_ps(dz22,fscal,fjz2);
2694
2695             }
2696
2697             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2698             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2699             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2700             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2701
2702             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2703                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2704
2705             /* Inner loop uses 459 flops */
2706         }
2707
2708         /* End of innermost loop */
2709
2710         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2711                                               f+i_coord_offset,fshift+i_shift_offset);
2712
2713         /* Increment number of inner iterations */
2714         inneriter                  += j_index_end - j_index_start;
2715
2716         /* Outer loop uses 18 flops */
2717     }
2718
2719     /* Increment number of outer iterations */
2720     outeriter        += nri;
2721
2722     /* Update outer/inner flops */
2723
2724     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*459);
2725 }