Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse4_1_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse4_1_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse4_1_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse4_1_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse4_1_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
110     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
111     __m128i          ewitab;
112     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
113     real             *ewtab;
114     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
115     real             rswitch_scalar,d_scalar;
116     __m128           dummy_mask,cutoff_mask;
117     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118     __m128           one     = _mm_set1_ps(1.0);
119     __m128           two     = _mm_set1_ps(2.0);
120     x                = xx[0];
121     f                = ff[0];
122
123     nri              = nlist->nri;
124     iinr             = nlist->iinr;
125     jindex           = nlist->jindex;
126     jjnr             = nlist->jjnr;
127     shiftidx         = nlist->shift;
128     gid              = nlist->gid;
129     shiftvec         = fr->shift_vec[0];
130     fshift           = fr->fshift[0];
131     facel            = _mm_set1_ps(fr->ic->epsfac);
132     charge           = mdatoms->chargeA;
133     nvdwtype         = fr->ntype;
134     vdwparam         = fr->nbfp;
135     vdwtype          = mdatoms->typeA;
136
137     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
138     ewtab            = fr->ic->tabq_coul_FDV0;
139     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
140     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
141
142     /* Setup water-specific parameters */
143     inr              = nlist->iinr[0];
144     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
145     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
146     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
147     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
148
149     jq0              = _mm_set1_ps(charge[inr+0]);
150     jq1              = _mm_set1_ps(charge[inr+1]);
151     jq2              = _mm_set1_ps(charge[inr+2]);
152     vdwjidx0A        = 2*vdwtype[inr+0];
153     qq00             = _mm_mul_ps(iq0,jq0);
154     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
155     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
156     qq01             = _mm_mul_ps(iq0,jq1);
157     qq02             = _mm_mul_ps(iq0,jq2);
158     qq10             = _mm_mul_ps(iq1,jq0);
159     qq11             = _mm_mul_ps(iq1,jq1);
160     qq12             = _mm_mul_ps(iq1,jq2);
161     qq20             = _mm_mul_ps(iq2,jq0);
162     qq21             = _mm_mul_ps(iq2,jq1);
163     qq22             = _mm_mul_ps(iq2,jq2);
164
165     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
166     rcutoff_scalar   = fr->ic->rcoulomb;
167     rcutoff          = _mm_set1_ps(rcutoff_scalar);
168     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
169
170     rswitch_scalar   = fr->ic->rcoulomb_switch;
171     rswitch          = _mm_set1_ps(rswitch_scalar);
172     /* Setup switch parameters */
173     d_scalar         = rcutoff_scalar-rswitch_scalar;
174     d                = _mm_set1_ps(d_scalar);
175     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
176     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
178     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
179     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
180     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
181
182     /* Avoid stupid compiler warnings */
183     jnrA = jnrB = jnrC = jnrD = 0;
184     j_coord_offsetA = 0;
185     j_coord_offsetB = 0;
186     j_coord_offsetC = 0;
187     j_coord_offsetD = 0;
188
189     outeriter        = 0;
190     inneriter        = 0;
191
192     for(iidx=0;iidx<4*DIM;iidx++)
193     {
194         scratch[iidx] = 0.0;
195     }
196
197     /* Start outer loop over neighborlists */
198     for(iidx=0; iidx<nri; iidx++)
199     {
200         /* Load shift vector for this list */
201         i_shift_offset   = DIM*shiftidx[iidx];
202
203         /* Load limits for loop over neighbors */
204         j_index_start    = jindex[iidx];
205         j_index_end      = jindex[iidx+1];
206
207         /* Get outer coordinate index */
208         inr              = iinr[iidx];
209         i_coord_offset   = DIM*inr;
210
211         /* Load i particle coords and add shift vector */
212         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
213                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
214
215         fix0             = _mm_setzero_ps();
216         fiy0             = _mm_setzero_ps();
217         fiz0             = _mm_setzero_ps();
218         fix1             = _mm_setzero_ps();
219         fiy1             = _mm_setzero_ps();
220         fiz1             = _mm_setzero_ps();
221         fix2             = _mm_setzero_ps();
222         fiy2             = _mm_setzero_ps();
223         fiz2             = _mm_setzero_ps();
224
225         /* Reset potential sums */
226         velecsum         = _mm_setzero_ps();
227         vvdwsum          = _mm_setzero_ps();
228
229         /* Start inner kernel loop */
230         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
231         {
232
233             /* Get j neighbor index, and coordinate index */
234             jnrA             = jjnr[jidx];
235             jnrB             = jjnr[jidx+1];
236             jnrC             = jjnr[jidx+2];
237             jnrD             = jjnr[jidx+3];
238             j_coord_offsetA  = DIM*jnrA;
239             j_coord_offsetB  = DIM*jnrB;
240             j_coord_offsetC  = DIM*jnrC;
241             j_coord_offsetD  = DIM*jnrD;
242
243             /* load j atom coordinates */
244             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
245                                               x+j_coord_offsetC,x+j_coord_offsetD,
246                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
247
248             /* Calculate displacement vector */
249             dx00             = _mm_sub_ps(ix0,jx0);
250             dy00             = _mm_sub_ps(iy0,jy0);
251             dz00             = _mm_sub_ps(iz0,jz0);
252             dx01             = _mm_sub_ps(ix0,jx1);
253             dy01             = _mm_sub_ps(iy0,jy1);
254             dz01             = _mm_sub_ps(iz0,jz1);
255             dx02             = _mm_sub_ps(ix0,jx2);
256             dy02             = _mm_sub_ps(iy0,jy2);
257             dz02             = _mm_sub_ps(iz0,jz2);
258             dx10             = _mm_sub_ps(ix1,jx0);
259             dy10             = _mm_sub_ps(iy1,jy0);
260             dz10             = _mm_sub_ps(iz1,jz0);
261             dx11             = _mm_sub_ps(ix1,jx1);
262             dy11             = _mm_sub_ps(iy1,jy1);
263             dz11             = _mm_sub_ps(iz1,jz1);
264             dx12             = _mm_sub_ps(ix1,jx2);
265             dy12             = _mm_sub_ps(iy1,jy2);
266             dz12             = _mm_sub_ps(iz1,jz2);
267             dx20             = _mm_sub_ps(ix2,jx0);
268             dy20             = _mm_sub_ps(iy2,jy0);
269             dz20             = _mm_sub_ps(iz2,jz0);
270             dx21             = _mm_sub_ps(ix2,jx1);
271             dy21             = _mm_sub_ps(iy2,jy1);
272             dz21             = _mm_sub_ps(iz2,jz1);
273             dx22             = _mm_sub_ps(ix2,jx2);
274             dy22             = _mm_sub_ps(iy2,jy2);
275             dz22             = _mm_sub_ps(iz2,jz2);
276
277             /* Calculate squared distance and things based on it */
278             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
279             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
280             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
281             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
282             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
283             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
284             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
285             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
286             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
287
288             rinv00           = sse41_invsqrt_f(rsq00);
289             rinv01           = sse41_invsqrt_f(rsq01);
290             rinv02           = sse41_invsqrt_f(rsq02);
291             rinv10           = sse41_invsqrt_f(rsq10);
292             rinv11           = sse41_invsqrt_f(rsq11);
293             rinv12           = sse41_invsqrt_f(rsq12);
294             rinv20           = sse41_invsqrt_f(rsq20);
295             rinv21           = sse41_invsqrt_f(rsq21);
296             rinv22           = sse41_invsqrt_f(rsq22);
297
298             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
299             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
300             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
301             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
302             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
303             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
304             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
305             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
306             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
307
308             fjx0             = _mm_setzero_ps();
309             fjy0             = _mm_setzero_ps();
310             fjz0             = _mm_setzero_ps();
311             fjx1             = _mm_setzero_ps();
312             fjy1             = _mm_setzero_ps();
313             fjz1             = _mm_setzero_ps();
314             fjx2             = _mm_setzero_ps();
315             fjy2             = _mm_setzero_ps();
316             fjz2             = _mm_setzero_ps();
317
318             /**************************
319              * CALCULATE INTERACTIONS *
320              **************************/
321
322             if (gmx_mm_any_lt(rsq00,rcutoff2))
323             {
324
325             r00              = _mm_mul_ps(rsq00,rinv00);
326
327             /* EWALD ELECTROSTATICS */
328
329             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
330             ewrt             = _mm_mul_ps(r00,ewtabscale);
331             ewitab           = _mm_cvttps_epi32(ewrt);
332             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
333             ewitab           = _mm_slli_epi32(ewitab,2);
334             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
335             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
336             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
337             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
338             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
339             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
340             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
341             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
342             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
343
344             /* LENNARD-JONES DISPERSION/REPULSION */
345
346             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
347             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
348             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
349             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
350             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
351
352             d                = _mm_sub_ps(r00,rswitch);
353             d                = _mm_max_ps(d,_mm_setzero_ps());
354             d2               = _mm_mul_ps(d,d);
355             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
356
357             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
358
359             /* Evaluate switch function */
360             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
361             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
362             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
363             velec            = _mm_mul_ps(velec,sw);
364             vvdw             = _mm_mul_ps(vvdw,sw);
365             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
366
367             /* Update potential sum for this i atom from the interaction with this j atom. */
368             velec            = _mm_and_ps(velec,cutoff_mask);
369             velecsum         = _mm_add_ps(velecsum,velec);
370             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
371             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
372
373             fscal            = _mm_add_ps(felec,fvdw);
374
375             fscal            = _mm_and_ps(fscal,cutoff_mask);
376
377             /* Calculate temporary vectorial force */
378             tx               = _mm_mul_ps(fscal,dx00);
379             ty               = _mm_mul_ps(fscal,dy00);
380             tz               = _mm_mul_ps(fscal,dz00);
381
382             /* Update vectorial force */
383             fix0             = _mm_add_ps(fix0,tx);
384             fiy0             = _mm_add_ps(fiy0,ty);
385             fiz0             = _mm_add_ps(fiz0,tz);
386
387             fjx0             = _mm_add_ps(fjx0,tx);
388             fjy0             = _mm_add_ps(fjy0,ty);
389             fjz0             = _mm_add_ps(fjz0,tz);
390
391             }
392
393             /**************************
394              * CALCULATE INTERACTIONS *
395              **************************/
396
397             if (gmx_mm_any_lt(rsq01,rcutoff2))
398             {
399
400             r01              = _mm_mul_ps(rsq01,rinv01);
401
402             /* EWALD ELECTROSTATICS */
403
404             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
405             ewrt             = _mm_mul_ps(r01,ewtabscale);
406             ewitab           = _mm_cvttps_epi32(ewrt);
407             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
408             ewitab           = _mm_slli_epi32(ewitab,2);
409             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
410             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
411             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
412             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
413             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
414             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
415             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
416             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
417             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
418
419             d                = _mm_sub_ps(r01,rswitch);
420             d                = _mm_max_ps(d,_mm_setzero_ps());
421             d2               = _mm_mul_ps(d,d);
422             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
423
424             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
425
426             /* Evaluate switch function */
427             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
428             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
429             velec            = _mm_mul_ps(velec,sw);
430             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
431
432             /* Update potential sum for this i atom from the interaction with this j atom. */
433             velec            = _mm_and_ps(velec,cutoff_mask);
434             velecsum         = _mm_add_ps(velecsum,velec);
435
436             fscal            = felec;
437
438             fscal            = _mm_and_ps(fscal,cutoff_mask);
439
440             /* Calculate temporary vectorial force */
441             tx               = _mm_mul_ps(fscal,dx01);
442             ty               = _mm_mul_ps(fscal,dy01);
443             tz               = _mm_mul_ps(fscal,dz01);
444
445             /* Update vectorial force */
446             fix0             = _mm_add_ps(fix0,tx);
447             fiy0             = _mm_add_ps(fiy0,ty);
448             fiz0             = _mm_add_ps(fiz0,tz);
449
450             fjx1             = _mm_add_ps(fjx1,tx);
451             fjy1             = _mm_add_ps(fjy1,ty);
452             fjz1             = _mm_add_ps(fjz1,tz);
453
454             }
455
456             /**************************
457              * CALCULATE INTERACTIONS *
458              **************************/
459
460             if (gmx_mm_any_lt(rsq02,rcutoff2))
461             {
462
463             r02              = _mm_mul_ps(rsq02,rinv02);
464
465             /* EWALD ELECTROSTATICS */
466
467             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
468             ewrt             = _mm_mul_ps(r02,ewtabscale);
469             ewitab           = _mm_cvttps_epi32(ewrt);
470             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
471             ewitab           = _mm_slli_epi32(ewitab,2);
472             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
473             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
474             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
475             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
476             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
477             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
478             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
479             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
480             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
481
482             d                = _mm_sub_ps(r02,rswitch);
483             d                = _mm_max_ps(d,_mm_setzero_ps());
484             d2               = _mm_mul_ps(d,d);
485             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
486
487             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
488
489             /* Evaluate switch function */
490             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
491             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
492             velec            = _mm_mul_ps(velec,sw);
493             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
494
495             /* Update potential sum for this i atom from the interaction with this j atom. */
496             velec            = _mm_and_ps(velec,cutoff_mask);
497             velecsum         = _mm_add_ps(velecsum,velec);
498
499             fscal            = felec;
500
501             fscal            = _mm_and_ps(fscal,cutoff_mask);
502
503             /* Calculate temporary vectorial force */
504             tx               = _mm_mul_ps(fscal,dx02);
505             ty               = _mm_mul_ps(fscal,dy02);
506             tz               = _mm_mul_ps(fscal,dz02);
507
508             /* Update vectorial force */
509             fix0             = _mm_add_ps(fix0,tx);
510             fiy0             = _mm_add_ps(fiy0,ty);
511             fiz0             = _mm_add_ps(fiz0,tz);
512
513             fjx2             = _mm_add_ps(fjx2,tx);
514             fjy2             = _mm_add_ps(fjy2,ty);
515             fjz2             = _mm_add_ps(fjz2,tz);
516
517             }
518
519             /**************************
520              * CALCULATE INTERACTIONS *
521              **************************/
522
523             if (gmx_mm_any_lt(rsq10,rcutoff2))
524             {
525
526             r10              = _mm_mul_ps(rsq10,rinv10);
527
528             /* EWALD ELECTROSTATICS */
529
530             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
531             ewrt             = _mm_mul_ps(r10,ewtabscale);
532             ewitab           = _mm_cvttps_epi32(ewrt);
533             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
534             ewitab           = _mm_slli_epi32(ewitab,2);
535             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
536             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
537             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
538             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
539             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
540             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
541             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
542             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
543             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
544
545             d                = _mm_sub_ps(r10,rswitch);
546             d                = _mm_max_ps(d,_mm_setzero_ps());
547             d2               = _mm_mul_ps(d,d);
548             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
549
550             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
551
552             /* Evaluate switch function */
553             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
554             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
555             velec            = _mm_mul_ps(velec,sw);
556             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
557
558             /* Update potential sum for this i atom from the interaction with this j atom. */
559             velec            = _mm_and_ps(velec,cutoff_mask);
560             velecsum         = _mm_add_ps(velecsum,velec);
561
562             fscal            = felec;
563
564             fscal            = _mm_and_ps(fscal,cutoff_mask);
565
566             /* Calculate temporary vectorial force */
567             tx               = _mm_mul_ps(fscal,dx10);
568             ty               = _mm_mul_ps(fscal,dy10);
569             tz               = _mm_mul_ps(fscal,dz10);
570
571             /* Update vectorial force */
572             fix1             = _mm_add_ps(fix1,tx);
573             fiy1             = _mm_add_ps(fiy1,ty);
574             fiz1             = _mm_add_ps(fiz1,tz);
575
576             fjx0             = _mm_add_ps(fjx0,tx);
577             fjy0             = _mm_add_ps(fjy0,ty);
578             fjz0             = _mm_add_ps(fjz0,tz);
579
580             }
581
582             /**************************
583              * CALCULATE INTERACTIONS *
584              **************************/
585
586             if (gmx_mm_any_lt(rsq11,rcutoff2))
587             {
588
589             r11              = _mm_mul_ps(rsq11,rinv11);
590
591             /* EWALD ELECTROSTATICS */
592
593             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
594             ewrt             = _mm_mul_ps(r11,ewtabscale);
595             ewitab           = _mm_cvttps_epi32(ewrt);
596             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
597             ewitab           = _mm_slli_epi32(ewitab,2);
598             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
599             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
600             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
601             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
602             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
603             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
604             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
605             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
606             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
607
608             d                = _mm_sub_ps(r11,rswitch);
609             d                = _mm_max_ps(d,_mm_setzero_ps());
610             d2               = _mm_mul_ps(d,d);
611             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
612
613             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
614
615             /* Evaluate switch function */
616             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
617             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
618             velec            = _mm_mul_ps(velec,sw);
619             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
620
621             /* Update potential sum for this i atom from the interaction with this j atom. */
622             velec            = _mm_and_ps(velec,cutoff_mask);
623             velecsum         = _mm_add_ps(velecsum,velec);
624
625             fscal            = felec;
626
627             fscal            = _mm_and_ps(fscal,cutoff_mask);
628
629             /* Calculate temporary vectorial force */
630             tx               = _mm_mul_ps(fscal,dx11);
631             ty               = _mm_mul_ps(fscal,dy11);
632             tz               = _mm_mul_ps(fscal,dz11);
633
634             /* Update vectorial force */
635             fix1             = _mm_add_ps(fix1,tx);
636             fiy1             = _mm_add_ps(fiy1,ty);
637             fiz1             = _mm_add_ps(fiz1,tz);
638
639             fjx1             = _mm_add_ps(fjx1,tx);
640             fjy1             = _mm_add_ps(fjy1,ty);
641             fjz1             = _mm_add_ps(fjz1,tz);
642
643             }
644
645             /**************************
646              * CALCULATE INTERACTIONS *
647              **************************/
648
649             if (gmx_mm_any_lt(rsq12,rcutoff2))
650             {
651
652             r12              = _mm_mul_ps(rsq12,rinv12);
653
654             /* EWALD ELECTROSTATICS */
655
656             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
657             ewrt             = _mm_mul_ps(r12,ewtabscale);
658             ewitab           = _mm_cvttps_epi32(ewrt);
659             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
660             ewitab           = _mm_slli_epi32(ewitab,2);
661             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
662             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
663             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
664             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
665             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
666             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
667             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
668             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
669             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
670
671             d                = _mm_sub_ps(r12,rswitch);
672             d                = _mm_max_ps(d,_mm_setzero_ps());
673             d2               = _mm_mul_ps(d,d);
674             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
675
676             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
677
678             /* Evaluate switch function */
679             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
680             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
681             velec            = _mm_mul_ps(velec,sw);
682             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
683
684             /* Update potential sum for this i atom from the interaction with this j atom. */
685             velec            = _mm_and_ps(velec,cutoff_mask);
686             velecsum         = _mm_add_ps(velecsum,velec);
687
688             fscal            = felec;
689
690             fscal            = _mm_and_ps(fscal,cutoff_mask);
691
692             /* Calculate temporary vectorial force */
693             tx               = _mm_mul_ps(fscal,dx12);
694             ty               = _mm_mul_ps(fscal,dy12);
695             tz               = _mm_mul_ps(fscal,dz12);
696
697             /* Update vectorial force */
698             fix1             = _mm_add_ps(fix1,tx);
699             fiy1             = _mm_add_ps(fiy1,ty);
700             fiz1             = _mm_add_ps(fiz1,tz);
701
702             fjx2             = _mm_add_ps(fjx2,tx);
703             fjy2             = _mm_add_ps(fjy2,ty);
704             fjz2             = _mm_add_ps(fjz2,tz);
705
706             }
707
708             /**************************
709              * CALCULATE INTERACTIONS *
710              **************************/
711
712             if (gmx_mm_any_lt(rsq20,rcutoff2))
713             {
714
715             r20              = _mm_mul_ps(rsq20,rinv20);
716
717             /* EWALD ELECTROSTATICS */
718
719             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
720             ewrt             = _mm_mul_ps(r20,ewtabscale);
721             ewitab           = _mm_cvttps_epi32(ewrt);
722             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
723             ewitab           = _mm_slli_epi32(ewitab,2);
724             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
725             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
726             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
727             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
728             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
729             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
730             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
731             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
732             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
733
734             d                = _mm_sub_ps(r20,rswitch);
735             d                = _mm_max_ps(d,_mm_setzero_ps());
736             d2               = _mm_mul_ps(d,d);
737             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
738
739             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
740
741             /* Evaluate switch function */
742             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
743             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
744             velec            = _mm_mul_ps(velec,sw);
745             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
746
747             /* Update potential sum for this i atom from the interaction with this j atom. */
748             velec            = _mm_and_ps(velec,cutoff_mask);
749             velecsum         = _mm_add_ps(velecsum,velec);
750
751             fscal            = felec;
752
753             fscal            = _mm_and_ps(fscal,cutoff_mask);
754
755             /* Calculate temporary vectorial force */
756             tx               = _mm_mul_ps(fscal,dx20);
757             ty               = _mm_mul_ps(fscal,dy20);
758             tz               = _mm_mul_ps(fscal,dz20);
759
760             /* Update vectorial force */
761             fix2             = _mm_add_ps(fix2,tx);
762             fiy2             = _mm_add_ps(fiy2,ty);
763             fiz2             = _mm_add_ps(fiz2,tz);
764
765             fjx0             = _mm_add_ps(fjx0,tx);
766             fjy0             = _mm_add_ps(fjy0,ty);
767             fjz0             = _mm_add_ps(fjz0,tz);
768
769             }
770
771             /**************************
772              * CALCULATE INTERACTIONS *
773              **************************/
774
775             if (gmx_mm_any_lt(rsq21,rcutoff2))
776             {
777
778             r21              = _mm_mul_ps(rsq21,rinv21);
779
780             /* EWALD ELECTROSTATICS */
781
782             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
783             ewrt             = _mm_mul_ps(r21,ewtabscale);
784             ewitab           = _mm_cvttps_epi32(ewrt);
785             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
786             ewitab           = _mm_slli_epi32(ewitab,2);
787             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
788             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
789             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
790             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
791             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
792             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
793             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
794             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
795             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
796
797             d                = _mm_sub_ps(r21,rswitch);
798             d                = _mm_max_ps(d,_mm_setzero_ps());
799             d2               = _mm_mul_ps(d,d);
800             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
801
802             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
803
804             /* Evaluate switch function */
805             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
806             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
807             velec            = _mm_mul_ps(velec,sw);
808             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
809
810             /* Update potential sum for this i atom from the interaction with this j atom. */
811             velec            = _mm_and_ps(velec,cutoff_mask);
812             velecsum         = _mm_add_ps(velecsum,velec);
813
814             fscal            = felec;
815
816             fscal            = _mm_and_ps(fscal,cutoff_mask);
817
818             /* Calculate temporary vectorial force */
819             tx               = _mm_mul_ps(fscal,dx21);
820             ty               = _mm_mul_ps(fscal,dy21);
821             tz               = _mm_mul_ps(fscal,dz21);
822
823             /* Update vectorial force */
824             fix2             = _mm_add_ps(fix2,tx);
825             fiy2             = _mm_add_ps(fiy2,ty);
826             fiz2             = _mm_add_ps(fiz2,tz);
827
828             fjx1             = _mm_add_ps(fjx1,tx);
829             fjy1             = _mm_add_ps(fjy1,ty);
830             fjz1             = _mm_add_ps(fjz1,tz);
831
832             }
833
834             /**************************
835              * CALCULATE INTERACTIONS *
836              **************************/
837
838             if (gmx_mm_any_lt(rsq22,rcutoff2))
839             {
840
841             r22              = _mm_mul_ps(rsq22,rinv22);
842
843             /* EWALD ELECTROSTATICS */
844
845             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
846             ewrt             = _mm_mul_ps(r22,ewtabscale);
847             ewitab           = _mm_cvttps_epi32(ewrt);
848             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
849             ewitab           = _mm_slli_epi32(ewitab,2);
850             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
851             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
852             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
853             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
854             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
855             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
856             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
857             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
858             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
859
860             d                = _mm_sub_ps(r22,rswitch);
861             d                = _mm_max_ps(d,_mm_setzero_ps());
862             d2               = _mm_mul_ps(d,d);
863             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
864
865             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
866
867             /* Evaluate switch function */
868             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
869             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
870             velec            = _mm_mul_ps(velec,sw);
871             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
872
873             /* Update potential sum for this i atom from the interaction with this j atom. */
874             velec            = _mm_and_ps(velec,cutoff_mask);
875             velecsum         = _mm_add_ps(velecsum,velec);
876
877             fscal            = felec;
878
879             fscal            = _mm_and_ps(fscal,cutoff_mask);
880
881             /* Calculate temporary vectorial force */
882             tx               = _mm_mul_ps(fscal,dx22);
883             ty               = _mm_mul_ps(fscal,dy22);
884             tz               = _mm_mul_ps(fscal,dz22);
885
886             /* Update vectorial force */
887             fix2             = _mm_add_ps(fix2,tx);
888             fiy2             = _mm_add_ps(fiy2,ty);
889             fiz2             = _mm_add_ps(fiz2,tz);
890
891             fjx2             = _mm_add_ps(fjx2,tx);
892             fjy2             = _mm_add_ps(fjy2,ty);
893             fjz2             = _mm_add_ps(fjz2,tz);
894
895             }
896
897             fjptrA             = f+j_coord_offsetA;
898             fjptrB             = f+j_coord_offsetB;
899             fjptrC             = f+j_coord_offsetC;
900             fjptrD             = f+j_coord_offsetD;
901
902             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
903                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
904
905             /* Inner loop uses 603 flops */
906         }
907
908         if(jidx<j_index_end)
909         {
910
911             /* Get j neighbor index, and coordinate index */
912             jnrlistA         = jjnr[jidx];
913             jnrlistB         = jjnr[jidx+1];
914             jnrlistC         = jjnr[jidx+2];
915             jnrlistD         = jjnr[jidx+3];
916             /* Sign of each element will be negative for non-real atoms.
917              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
918              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
919              */
920             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
921             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
922             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
923             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
924             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
925             j_coord_offsetA  = DIM*jnrA;
926             j_coord_offsetB  = DIM*jnrB;
927             j_coord_offsetC  = DIM*jnrC;
928             j_coord_offsetD  = DIM*jnrD;
929
930             /* load j atom coordinates */
931             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
932                                               x+j_coord_offsetC,x+j_coord_offsetD,
933                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
934
935             /* Calculate displacement vector */
936             dx00             = _mm_sub_ps(ix0,jx0);
937             dy00             = _mm_sub_ps(iy0,jy0);
938             dz00             = _mm_sub_ps(iz0,jz0);
939             dx01             = _mm_sub_ps(ix0,jx1);
940             dy01             = _mm_sub_ps(iy0,jy1);
941             dz01             = _mm_sub_ps(iz0,jz1);
942             dx02             = _mm_sub_ps(ix0,jx2);
943             dy02             = _mm_sub_ps(iy0,jy2);
944             dz02             = _mm_sub_ps(iz0,jz2);
945             dx10             = _mm_sub_ps(ix1,jx0);
946             dy10             = _mm_sub_ps(iy1,jy0);
947             dz10             = _mm_sub_ps(iz1,jz0);
948             dx11             = _mm_sub_ps(ix1,jx1);
949             dy11             = _mm_sub_ps(iy1,jy1);
950             dz11             = _mm_sub_ps(iz1,jz1);
951             dx12             = _mm_sub_ps(ix1,jx2);
952             dy12             = _mm_sub_ps(iy1,jy2);
953             dz12             = _mm_sub_ps(iz1,jz2);
954             dx20             = _mm_sub_ps(ix2,jx0);
955             dy20             = _mm_sub_ps(iy2,jy0);
956             dz20             = _mm_sub_ps(iz2,jz0);
957             dx21             = _mm_sub_ps(ix2,jx1);
958             dy21             = _mm_sub_ps(iy2,jy1);
959             dz21             = _mm_sub_ps(iz2,jz1);
960             dx22             = _mm_sub_ps(ix2,jx2);
961             dy22             = _mm_sub_ps(iy2,jy2);
962             dz22             = _mm_sub_ps(iz2,jz2);
963
964             /* Calculate squared distance and things based on it */
965             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
966             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
967             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
968             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
969             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
970             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
971             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
972             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
973             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
974
975             rinv00           = sse41_invsqrt_f(rsq00);
976             rinv01           = sse41_invsqrt_f(rsq01);
977             rinv02           = sse41_invsqrt_f(rsq02);
978             rinv10           = sse41_invsqrt_f(rsq10);
979             rinv11           = sse41_invsqrt_f(rsq11);
980             rinv12           = sse41_invsqrt_f(rsq12);
981             rinv20           = sse41_invsqrt_f(rsq20);
982             rinv21           = sse41_invsqrt_f(rsq21);
983             rinv22           = sse41_invsqrt_f(rsq22);
984
985             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
986             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
987             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
988             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
989             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
990             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
991             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
992             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
993             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
994
995             fjx0             = _mm_setzero_ps();
996             fjy0             = _mm_setzero_ps();
997             fjz0             = _mm_setzero_ps();
998             fjx1             = _mm_setzero_ps();
999             fjy1             = _mm_setzero_ps();
1000             fjz1             = _mm_setzero_ps();
1001             fjx2             = _mm_setzero_ps();
1002             fjy2             = _mm_setzero_ps();
1003             fjz2             = _mm_setzero_ps();
1004
1005             /**************************
1006              * CALCULATE INTERACTIONS *
1007              **************************/
1008
1009             if (gmx_mm_any_lt(rsq00,rcutoff2))
1010             {
1011
1012             r00              = _mm_mul_ps(rsq00,rinv00);
1013             r00              = _mm_andnot_ps(dummy_mask,r00);
1014
1015             /* EWALD ELECTROSTATICS */
1016
1017             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1018             ewrt             = _mm_mul_ps(r00,ewtabscale);
1019             ewitab           = _mm_cvttps_epi32(ewrt);
1020             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1021             ewitab           = _mm_slli_epi32(ewitab,2);
1022             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1023             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1024             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1025             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1026             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1027             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1028             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1029             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1030             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1031
1032             /* LENNARD-JONES DISPERSION/REPULSION */
1033
1034             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1035             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1036             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1037             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1038             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1039
1040             d                = _mm_sub_ps(r00,rswitch);
1041             d                = _mm_max_ps(d,_mm_setzero_ps());
1042             d2               = _mm_mul_ps(d,d);
1043             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1044
1045             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1046
1047             /* Evaluate switch function */
1048             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1049             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1050             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1051             velec            = _mm_mul_ps(velec,sw);
1052             vvdw             = _mm_mul_ps(vvdw,sw);
1053             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1054
1055             /* Update potential sum for this i atom from the interaction with this j atom. */
1056             velec            = _mm_and_ps(velec,cutoff_mask);
1057             velec            = _mm_andnot_ps(dummy_mask,velec);
1058             velecsum         = _mm_add_ps(velecsum,velec);
1059             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
1060             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
1061             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
1062
1063             fscal            = _mm_add_ps(felec,fvdw);
1064
1065             fscal            = _mm_and_ps(fscal,cutoff_mask);
1066
1067             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1068
1069             /* Calculate temporary vectorial force */
1070             tx               = _mm_mul_ps(fscal,dx00);
1071             ty               = _mm_mul_ps(fscal,dy00);
1072             tz               = _mm_mul_ps(fscal,dz00);
1073
1074             /* Update vectorial force */
1075             fix0             = _mm_add_ps(fix0,tx);
1076             fiy0             = _mm_add_ps(fiy0,ty);
1077             fiz0             = _mm_add_ps(fiz0,tz);
1078
1079             fjx0             = _mm_add_ps(fjx0,tx);
1080             fjy0             = _mm_add_ps(fjy0,ty);
1081             fjz0             = _mm_add_ps(fjz0,tz);
1082
1083             }
1084
1085             /**************************
1086              * CALCULATE INTERACTIONS *
1087              **************************/
1088
1089             if (gmx_mm_any_lt(rsq01,rcutoff2))
1090             {
1091
1092             r01              = _mm_mul_ps(rsq01,rinv01);
1093             r01              = _mm_andnot_ps(dummy_mask,r01);
1094
1095             /* EWALD ELECTROSTATICS */
1096
1097             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1098             ewrt             = _mm_mul_ps(r01,ewtabscale);
1099             ewitab           = _mm_cvttps_epi32(ewrt);
1100             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1101             ewitab           = _mm_slli_epi32(ewitab,2);
1102             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1103             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1104             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1105             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1106             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1107             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1108             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1109             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1110             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1111
1112             d                = _mm_sub_ps(r01,rswitch);
1113             d                = _mm_max_ps(d,_mm_setzero_ps());
1114             d2               = _mm_mul_ps(d,d);
1115             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1116
1117             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1118
1119             /* Evaluate switch function */
1120             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1121             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1122             velec            = _mm_mul_ps(velec,sw);
1123             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1124
1125             /* Update potential sum for this i atom from the interaction with this j atom. */
1126             velec            = _mm_and_ps(velec,cutoff_mask);
1127             velec            = _mm_andnot_ps(dummy_mask,velec);
1128             velecsum         = _mm_add_ps(velecsum,velec);
1129
1130             fscal            = felec;
1131
1132             fscal            = _mm_and_ps(fscal,cutoff_mask);
1133
1134             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1135
1136             /* Calculate temporary vectorial force */
1137             tx               = _mm_mul_ps(fscal,dx01);
1138             ty               = _mm_mul_ps(fscal,dy01);
1139             tz               = _mm_mul_ps(fscal,dz01);
1140
1141             /* Update vectorial force */
1142             fix0             = _mm_add_ps(fix0,tx);
1143             fiy0             = _mm_add_ps(fiy0,ty);
1144             fiz0             = _mm_add_ps(fiz0,tz);
1145
1146             fjx1             = _mm_add_ps(fjx1,tx);
1147             fjy1             = _mm_add_ps(fjy1,ty);
1148             fjz1             = _mm_add_ps(fjz1,tz);
1149
1150             }
1151
1152             /**************************
1153              * CALCULATE INTERACTIONS *
1154              **************************/
1155
1156             if (gmx_mm_any_lt(rsq02,rcutoff2))
1157             {
1158
1159             r02              = _mm_mul_ps(rsq02,rinv02);
1160             r02              = _mm_andnot_ps(dummy_mask,r02);
1161
1162             /* EWALD ELECTROSTATICS */
1163
1164             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1165             ewrt             = _mm_mul_ps(r02,ewtabscale);
1166             ewitab           = _mm_cvttps_epi32(ewrt);
1167             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1168             ewitab           = _mm_slli_epi32(ewitab,2);
1169             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1170             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1171             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1172             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1173             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1174             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1175             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1176             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1177             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1178
1179             d                = _mm_sub_ps(r02,rswitch);
1180             d                = _mm_max_ps(d,_mm_setzero_ps());
1181             d2               = _mm_mul_ps(d,d);
1182             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1183
1184             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1185
1186             /* Evaluate switch function */
1187             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1188             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1189             velec            = _mm_mul_ps(velec,sw);
1190             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1191
1192             /* Update potential sum for this i atom from the interaction with this j atom. */
1193             velec            = _mm_and_ps(velec,cutoff_mask);
1194             velec            = _mm_andnot_ps(dummy_mask,velec);
1195             velecsum         = _mm_add_ps(velecsum,velec);
1196
1197             fscal            = felec;
1198
1199             fscal            = _mm_and_ps(fscal,cutoff_mask);
1200
1201             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1202
1203             /* Calculate temporary vectorial force */
1204             tx               = _mm_mul_ps(fscal,dx02);
1205             ty               = _mm_mul_ps(fscal,dy02);
1206             tz               = _mm_mul_ps(fscal,dz02);
1207
1208             /* Update vectorial force */
1209             fix0             = _mm_add_ps(fix0,tx);
1210             fiy0             = _mm_add_ps(fiy0,ty);
1211             fiz0             = _mm_add_ps(fiz0,tz);
1212
1213             fjx2             = _mm_add_ps(fjx2,tx);
1214             fjy2             = _mm_add_ps(fjy2,ty);
1215             fjz2             = _mm_add_ps(fjz2,tz);
1216
1217             }
1218
1219             /**************************
1220              * CALCULATE INTERACTIONS *
1221              **************************/
1222
1223             if (gmx_mm_any_lt(rsq10,rcutoff2))
1224             {
1225
1226             r10              = _mm_mul_ps(rsq10,rinv10);
1227             r10              = _mm_andnot_ps(dummy_mask,r10);
1228
1229             /* EWALD ELECTROSTATICS */
1230
1231             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1232             ewrt             = _mm_mul_ps(r10,ewtabscale);
1233             ewitab           = _mm_cvttps_epi32(ewrt);
1234             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1235             ewitab           = _mm_slli_epi32(ewitab,2);
1236             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1237             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1238             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1239             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1240             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1241             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1242             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1243             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1244             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1245
1246             d                = _mm_sub_ps(r10,rswitch);
1247             d                = _mm_max_ps(d,_mm_setzero_ps());
1248             d2               = _mm_mul_ps(d,d);
1249             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1250
1251             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1252
1253             /* Evaluate switch function */
1254             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1255             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1256             velec            = _mm_mul_ps(velec,sw);
1257             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1258
1259             /* Update potential sum for this i atom from the interaction with this j atom. */
1260             velec            = _mm_and_ps(velec,cutoff_mask);
1261             velec            = _mm_andnot_ps(dummy_mask,velec);
1262             velecsum         = _mm_add_ps(velecsum,velec);
1263
1264             fscal            = felec;
1265
1266             fscal            = _mm_and_ps(fscal,cutoff_mask);
1267
1268             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1269
1270             /* Calculate temporary vectorial force */
1271             tx               = _mm_mul_ps(fscal,dx10);
1272             ty               = _mm_mul_ps(fscal,dy10);
1273             tz               = _mm_mul_ps(fscal,dz10);
1274
1275             /* Update vectorial force */
1276             fix1             = _mm_add_ps(fix1,tx);
1277             fiy1             = _mm_add_ps(fiy1,ty);
1278             fiz1             = _mm_add_ps(fiz1,tz);
1279
1280             fjx0             = _mm_add_ps(fjx0,tx);
1281             fjy0             = _mm_add_ps(fjy0,ty);
1282             fjz0             = _mm_add_ps(fjz0,tz);
1283
1284             }
1285
1286             /**************************
1287              * CALCULATE INTERACTIONS *
1288              **************************/
1289
1290             if (gmx_mm_any_lt(rsq11,rcutoff2))
1291             {
1292
1293             r11              = _mm_mul_ps(rsq11,rinv11);
1294             r11              = _mm_andnot_ps(dummy_mask,r11);
1295
1296             /* EWALD ELECTROSTATICS */
1297
1298             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1299             ewrt             = _mm_mul_ps(r11,ewtabscale);
1300             ewitab           = _mm_cvttps_epi32(ewrt);
1301             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1302             ewitab           = _mm_slli_epi32(ewitab,2);
1303             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1304             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1305             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1306             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1307             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1308             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1309             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1310             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1311             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1312
1313             d                = _mm_sub_ps(r11,rswitch);
1314             d                = _mm_max_ps(d,_mm_setzero_ps());
1315             d2               = _mm_mul_ps(d,d);
1316             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1317
1318             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1319
1320             /* Evaluate switch function */
1321             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1322             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1323             velec            = _mm_mul_ps(velec,sw);
1324             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1325
1326             /* Update potential sum for this i atom from the interaction with this j atom. */
1327             velec            = _mm_and_ps(velec,cutoff_mask);
1328             velec            = _mm_andnot_ps(dummy_mask,velec);
1329             velecsum         = _mm_add_ps(velecsum,velec);
1330
1331             fscal            = felec;
1332
1333             fscal            = _mm_and_ps(fscal,cutoff_mask);
1334
1335             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1336
1337             /* Calculate temporary vectorial force */
1338             tx               = _mm_mul_ps(fscal,dx11);
1339             ty               = _mm_mul_ps(fscal,dy11);
1340             tz               = _mm_mul_ps(fscal,dz11);
1341
1342             /* Update vectorial force */
1343             fix1             = _mm_add_ps(fix1,tx);
1344             fiy1             = _mm_add_ps(fiy1,ty);
1345             fiz1             = _mm_add_ps(fiz1,tz);
1346
1347             fjx1             = _mm_add_ps(fjx1,tx);
1348             fjy1             = _mm_add_ps(fjy1,ty);
1349             fjz1             = _mm_add_ps(fjz1,tz);
1350
1351             }
1352
1353             /**************************
1354              * CALCULATE INTERACTIONS *
1355              **************************/
1356
1357             if (gmx_mm_any_lt(rsq12,rcutoff2))
1358             {
1359
1360             r12              = _mm_mul_ps(rsq12,rinv12);
1361             r12              = _mm_andnot_ps(dummy_mask,r12);
1362
1363             /* EWALD ELECTROSTATICS */
1364
1365             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1366             ewrt             = _mm_mul_ps(r12,ewtabscale);
1367             ewitab           = _mm_cvttps_epi32(ewrt);
1368             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1369             ewitab           = _mm_slli_epi32(ewitab,2);
1370             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1371             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1372             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1373             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1374             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1375             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1376             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1377             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1378             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1379
1380             d                = _mm_sub_ps(r12,rswitch);
1381             d                = _mm_max_ps(d,_mm_setzero_ps());
1382             d2               = _mm_mul_ps(d,d);
1383             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1384
1385             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1386
1387             /* Evaluate switch function */
1388             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1389             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1390             velec            = _mm_mul_ps(velec,sw);
1391             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1392
1393             /* Update potential sum for this i atom from the interaction with this j atom. */
1394             velec            = _mm_and_ps(velec,cutoff_mask);
1395             velec            = _mm_andnot_ps(dummy_mask,velec);
1396             velecsum         = _mm_add_ps(velecsum,velec);
1397
1398             fscal            = felec;
1399
1400             fscal            = _mm_and_ps(fscal,cutoff_mask);
1401
1402             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1403
1404             /* Calculate temporary vectorial force */
1405             tx               = _mm_mul_ps(fscal,dx12);
1406             ty               = _mm_mul_ps(fscal,dy12);
1407             tz               = _mm_mul_ps(fscal,dz12);
1408
1409             /* Update vectorial force */
1410             fix1             = _mm_add_ps(fix1,tx);
1411             fiy1             = _mm_add_ps(fiy1,ty);
1412             fiz1             = _mm_add_ps(fiz1,tz);
1413
1414             fjx2             = _mm_add_ps(fjx2,tx);
1415             fjy2             = _mm_add_ps(fjy2,ty);
1416             fjz2             = _mm_add_ps(fjz2,tz);
1417
1418             }
1419
1420             /**************************
1421              * CALCULATE INTERACTIONS *
1422              **************************/
1423
1424             if (gmx_mm_any_lt(rsq20,rcutoff2))
1425             {
1426
1427             r20              = _mm_mul_ps(rsq20,rinv20);
1428             r20              = _mm_andnot_ps(dummy_mask,r20);
1429
1430             /* EWALD ELECTROSTATICS */
1431
1432             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1433             ewrt             = _mm_mul_ps(r20,ewtabscale);
1434             ewitab           = _mm_cvttps_epi32(ewrt);
1435             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1436             ewitab           = _mm_slli_epi32(ewitab,2);
1437             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1438             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1439             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1440             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1441             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1442             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1443             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1444             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1445             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1446
1447             d                = _mm_sub_ps(r20,rswitch);
1448             d                = _mm_max_ps(d,_mm_setzero_ps());
1449             d2               = _mm_mul_ps(d,d);
1450             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1451
1452             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1453
1454             /* Evaluate switch function */
1455             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1456             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1457             velec            = _mm_mul_ps(velec,sw);
1458             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1459
1460             /* Update potential sum for this i atom from the interaction with this j atom. */
1461             velec            = _mm_and_ps(velec,cutoff_mask);
1462             velec            = _mm_andnot_ps(dummy_mask,velec);
1463             velecsum         = _mm_add_ps(velecsum,velec);
1464
1465             fscal            = felec;
1466
1467             fscal            = _mm_and_ps(fscal,cutoff_mask);
1468
1469             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1470
1471             /* Calculate temporary vectorial force */
1472             tx               = _mm_mul_ps(fscal,dx20);
1473             ty               = _mm_mul_ps(fscal,dy20);
1474             tz               = _mm_mul_ps(fscal,dz20);
1475
1476             /* Update vectorial force */
1477             fix2             = _mm_add_ps(fix2,tx);
1478             fiy2             = _mm_add_ps(fiy2,ty);
1479             fiz2             = _mm_add_ps(fiz2,tz);
1480
1481             fjx0             = _mm_add_ps(fjx0,tx);
1482             fjy0             = _mm_add_ps(fjy0,ty);
1483             fjz0             = _mm_add_ps(fjz0,tz);
1484
1485             }
1486
1487             /**************************
1488              * CALCULATE INTERACTIONS *
1489              **************************/
1490
1491             if (gmx_mm_any_lt(rsq21,rcutoff2))
1492             {
1493
1494             r21              = _mm_mul_ps(rsq21,rinv21);
1495             r21              = _mm_andnot_ps(dummy_mask,r21);
1496
1497             /* EWALD ELECTROSTATICS */
1498
1499             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1500             ewrt             = _mm_mul_ps(r21,ewtabscale);
1501             ewitab           = _mm_cvttps_epi32(ewrt);
1502             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1503             ewitab           = _mm_slli_epi32(ewitab,2);
1504             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1505             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1506             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1507             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1508             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1509             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1510             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1511             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1512             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1513
1514             d                = _mm_sub_ps(r21,rswitch);
1515             d                = _mm_max_ps(d,_mm_setzero_ps());
1516             d2               = _mm_mul_ps(d,d);
1517             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1518
1519             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1520
1521             /* Evaluate switch function */
1522             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1523             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1524             velec            = _mm_mul_ps(velec,sw);
1525             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1526
1527             /* Update potential sum for this i atom from the interaction with this j atom. */
1528             velec            = _mm_and_ps(velec,cutoff_mask);
1529             velec            = _mm_andnot_ps(dummy_mask,velec);
1530             velecsum         = _mm_add_ps(velecsum,velec);
1531
1532             fscal            = felec;
1533
1534             fscal            = _mm_and_ps(fscal,cutoff_mask);
1535
1536             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1537
1538             /* Calculate temporary vectorial force */
1539             tx               = _mm_mul_ps(fscal,dx21);
1540             ty               = _mm_mul_ps(fscal,dy21);
1541             tz               = _mm_mul_ps(fscal,dz21);
1542
1543             /* Update vectorial force */
1544             fix2             = _mm_add_ps(fix2,tx);
1545             fiy2             = _mm_add_ps(fiy2,ty);
1546             fiz2             = _mm_add_ps(fiz2,tz);
1547
1548             fjx1             = _mm_add_ps(fjx1,tx);
1549             fjy1             = _mm_add_ps(fjy1,ty);
1550             fjz1             = _mm_add_ps(fjz1,tz);
1551
1552             }
1553
1554             /**************************
1555              * CALCULATE INTERACTIONS *
1556              **************************/
1557
1558             if (gmx_mm_any_lt(rsq22,rcutoff2))
1559             {
1560
1561             r22              = _mm_mul_ps(rsq22,rinv22);
1562             r22              = _mm_andnot_ps(dummy_mask,r22);
1563
1564             /* EWALD ELECTROSTATICS */
1565
1566             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1567             ewrt             = _mm_mul_ps(r22,ewtabscale);
1568             ewitab           = _mm_cvttps_epi32(ewrt);
1569             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1570             ewitab           = _mm_slli_epi32(ewitab,2);
1571             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1572             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1573             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1574             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1575             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1576             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1577             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1578             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1579             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1580
1581             d                = _mm_sub_ps(r22,rswitch);
1582             d                = _mm_max_ps(d,_mm_setzero_ps());
1583             d2               = _mm_mul_ps(d,d);
1584             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1585
1586             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1587
1588             /* Evaluate switch function */
1589             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1590             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1591             velec            = _mm_mul_ps(velec,sw);
1592             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1593
1594             /* Update potential sum for this i atom from the interaction with this j atom. */
1595             velec            = _mm_and_ps(velec,cutoff_mask);
1596             velec            = _mm_andnot_ps(dummy_mask,velec);
1597             velecsum         = _mm_add_ps(velecsum,velec);
1598
1599             fscal            = felec;
1600
1601             fscal            = _mm_and_ps(fscal,cutoff_mask);
1602
1603             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1604
1605             /* Calculate temporary vectorial force */
1606             tx               = _mm_mul_ps(fscal,dx22);
1607             ty               = _mm_mul_ps(fscal,dy22);
1608             tz               = _mm_mul_ps(fscal,dz22);
1609
1610             /* Update vectorial force */
1611             fix2             = _mm_add_ps(fix2,tx);
1612             fiy2             = _mm_add_ps(fiy2,ty);
1613             fiz2             = _mm_add_ps(fiz2,tz);
1614
1615             fjx2             = _mm_add_ps(fjx2,tx);
1616             fjy2             = _mm_add_ps(fjy2,ty);
1617             fjz2             = _mm_add_ps(fjz2,tz);
1618
1619             }
1620
1621             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1622             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1623             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1624             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1625
1626             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1627                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1628
1629             /* Inner loop uses 612 flops */
1630         }
1631
1632         /* End of innermost loop */
1633
1634         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1635                                               f+i_coord_offset,fshift+i_shift_offset);
1636
1637         ggid                        = gid[iidx];
1638         /* Update potential energies */
1639         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1640         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1641
1642         /* Increment number of inner iterations */
1643         inneriter                  += j_index_end - j_index_start;
1644
1645         /* Outer loop uses 20 flops */
1646     }
1647
1648     /* Increment number of outer iterations */
1649     outeriter        += nri;
1650
1651     /* Update outer/inner flops */
1652
1653     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
1654 }
1655 /*
1656  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single
1657  * Electrostatics interaction: Ewald
1658  * VdW interaction:            LennardJones
1659  * Geometry:                   Water3-Water3
1660  * Calculate force/pot:        Force
1661  */
1662 void
1663 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse4_1_single
1664                     (t_nblist                    * gmx_restrict       nlist,
1665                      rvec                        * gmx_restrict          xx,
1666                      rvec                        * gmx_restrict          ff,
1667                      struct t_forcerec           * gmx_restrict          fr,
1668                      t_mdatoms                   * gmx_restrict     mdatoms,
1669                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1670                      t_nrnb                      * gmx_restrict        nrnb)
1671 {
1672     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1673      * just 0 for non-waters.
1674      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1675      * jnr indices corresponding to data put in the four positions in the SIMD register.
1676      */
1677     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1678     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1679     int              jnrA,jnrB,jnrC,jnrD;
1680     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1681     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1682     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1683     real             rcutoff_scalar;
1684     real             *shiftvec,*fshift,*x,*f;
1685     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1686     real             scratch[4*DIM];
1687     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1688     int              vdwioffset0;
1689     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1690     int              vdwioffset1;
1691     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1692     int              vdwioffset2;
1693     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1694     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1695     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1696     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1697     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1698     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1699     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1700     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1701     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1702     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1703     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1704     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1705     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1706     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1707     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1708     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1709     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1710     real             *charge;
1711     int              nvdwtype;
1712     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1713     int              *vdwtype;
1714     real             *vdwparam;
1715     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1716     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1717     __m128i          ewitab;
1718     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1719     real             *ewtab;
1720     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1721     real             rswitch_scalar,d_scalar;
1722     __m128           dummy_mask,cutoff_mask;
1723     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1724     __m128           one     = _mm_set1_ps(1.0);
1725     __m128           two     = _mm_set1_ps(2.0);
1726     x                = xx[0];
1727     f                = ff[0];
1728
1729     nri              = nlist->nri;
1730     iinr             = nlist->iinr;
1731     jindex           = nlist->jindex;
1732     jjnr             = nlist->jjnr;
1733     shiftidx         = nlist->shift;
1734     gid              = nlist->gid;
1735     shiftvec         = fr->shift_vec[0];
1736     fshift           = fr->fshift[0];
1737     facel            = _mm_set1_ps(fr->ic->epsfac);
1738     charge           = mdatoms->chargeA;
1739     nvdwtype         = fr->ntype;
1740     vdwparam         = fr->nbfp;
1741     vdwtype          = mdatoms->typeA;
1742
1743     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1744     ewtab            = fr->ic->tabq_coul_FDV0;
1745     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1746     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1747
1748     /* Setup water-specific parameters */
1749     inr              = nlist->iinr[0];
1750     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1751     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1752     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1753     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1754
1755     jq0              = _mm_set1_ps(charge[inr+0]);
1756     jq1              = _mm_set1_ps(charge[inr+1]);
1757     jq2              = _mm_set1_ps(charge[inr+2]);
1758     vdwjidx0A        = 2*vdwtype[inr+0];
1759     qq00             = _mm_mul_ps(iq0,jq0);
1760     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1761     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1762     qq01             = _mm_mul_ps(iq0,jq1);
1763     qq02             = _mm_mul_ps(iq0,jq2);
1764     qq10             = _mm_mul_ps(iq1,jq0);
1765     qq11             = _mm_mul_ps(iq1,jq1);
1766     qq12             = _mm_mul_ps(iq1,jq2);
1767     qq20             = _mm_mul_ps(iq2,jq0);
1768     qq21             = _mm_mul_ps(iq2,jq1);
1769     qq22             = _mm_mul_ps(iq2,jq2);
1770
1771     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1772     rcutoff_scalar   = fr->ic->rcoulomb;
1773     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1774     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1775
1776     rswitch_scalar   = fr->ic->rcoulomb_switch;
1777     rswitch          = _mm_set1_ps(rswitch_scalar);
1778     /* Setup switch parameters */
1779     d_scalar         = rcutoff_scalar-rswitch_scalar;
1780     d                = _mm_set1_ps(d_scalar);
1781     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1782     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1783     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1784     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1785     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1786     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1787
1788     /* Avoid stupid compiler warnings */
1789     jnrA = jnrB = jnrC = jnrD = 0;
1790     j_coord_offsetA = 0;
1791     j_coord_offsetB = 0;
1792     j_coord_offsetC = 0;
1793     j_coord_offsetD = 0;
1794
1795     outeriter        = 0;
1796     inneriter        = 0;
1797
1798     for(iidx=0;iidx<4*DIM;iidx++)
1799     {
1800         scratch[iidx] = 0.0;
1801     }
1802
1803     /* Start outer loop over neighborlists */
1804     for(iidx=0; iidx<nri; iidx++)
1805     {
1806         /* Load shift vector for this list */
1807         i_shift_offset   = DIM*shiftidx[iidx];
1808
1809         /* Load limits for loop over neighbors */
1810         j_index_start    = jindex[iidx];
1811         j_index_end      = jindex[iidx+1];
1812
1813         /* Get outer coordinate index */
1814         inr              = iinr[iidx];
1815         i_coord_offset   = DIM*inr;
1816
1817         /* Load i particle coords and add shift vector */
1818         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1819                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1820
1821         fix0             = _mm_setzero_ps();
1822         fiy0             = _mm_setzero_ps();
1823         fiz0             = _mm_setzero_ps();
1824         fix1             = _mm_setzero_ps();
1825         fiy1             = _mm_setzero_ps();
1826         fiz1             = _mm_setzero_ps();
1827         fix2             = _mm_setzero_ps();
1828         fiy2             = _mm_setzero_ps();
1829         fiz2             = _mm_setzero_ps();
1830
1831         /* Start inner kernel loop */
1832         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1833         {
1834
1835             /* Get j neighbor index, and coordinate index */
1836             jnrA             = jjnr[jidx];
1837             jnrB             = jjnr[jidx+1];
1838             jnrC             = jjnr[jidx+2];
1839             jnrD             = jjnr[jidx+3];
1840             j_coord_offsetA  = DIM*jnrA;
1841             j_coord_offsetB  = DIM*jnrB;
1842             j_coord_offsetC  = DIM*jnrC;
1843             j_coord_offsetD  = DIM*jnrD;
1844
1845             /* load j atom coordinates */
1846             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1847                                               x+j_coord_offsetC,x+j_coord_offsetD,
1848                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1849
1850             /* Calculate displacement vector */
1851             dx00             = _mm_sub_ps(ix0,jx0);
1852             dy00             = _mm_sub_ps(iy0,jy0);
1853             dz00             = _mm_sub_ps(iz0,jz0);
1854             dx01             = _mm_sub_ps(ix0,jx1);
1855             dy01             = _mm_sub_ps(iy0,jy1);
1856             dz01             = _mm_sub_ps(iz0,jz1);
1857             dx02             = _mm_sub_ps(ix0,jx2);
1858             dy02             = _mm_sub_ps(iy0,jy2);
1859             dz02             = _mm_sub_ps(iz0,jz2);
1860             dx10             = _mm_sub_ps(ix1,jx0);
1861             dy10             = _mm_sub_ps(iy1,jy0);
1862             dz10             = _mm_sub_ps(iz1,jz0);
1863             dx11             = _mm_sub_ps(ix1,jx1);
1864             dy11             = _mm_sub_ps(iy1,jy1);
1865             dz11             = _mm_sub_ps(iz1,jz1);
1866             dx12             = _mm_sub_ps(ix1,jx2);
1867             dy12             = _mm_sub_ps(iy1,jy2);
1868             dz12             = _mm_sub_ps(iz1,jz2);
1869             dx20             = _mm_sub_ps(ix2,jx0);
1870             dy20             = _mm_sub_ps(iy2,jy0);
1871             dz20             = _mm_sub_ps(iz2,jz0);
1872             dx21             = _mm_sub_ps(ix2,jx1);
1873             dy21             = _mm_sub_ps(iy2,jy1);
1874             dz21             = _mm_sub_ps(iz2,jz1);
1875             dx22             = _mm_sub_ps(ix2,jx2);
1876             dy22             = _mm_sub_ps(iy2,jy2);
1877             dz22             = _mm_sub_ps(iz2,jz2);
1878
1879             /* Calculate squared distance and things based on it */
1880             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1881             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1882             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1883             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1884             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1885             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1886             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1887             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1888             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1889
1890             rinv00           = sse41_invsqrt_f(rsq00);
1891             rinv01           = sse41_invsqrt_f(rsq01);
1892             rinv02           = sse41_invsqrt_f(rsq02);
1893             rinv10           = sse41_invsqrt_f(rsq10);
1894             rinv11           = sse41_invsqrt_f(rsq11);
1895             rinv12           = sse41_invsqrt_f(rsq12);
1896             rinv20           = sse41_invsqrt_f(rsq20);
1897             rinv21           = sse41_invsqrt_f(rsq21);
1898             rinv22           = sse41_invsqrt_f(rsq22);
1899
1900             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1901             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1902             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1903             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1904             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1905             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1906             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1907             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1908             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1909
1910             fjx0             = _mm_setzero_ps();
1911             fjy0             = _mm_setzero_ps();
1912             fjz0             = _mm_setzero_ps();
1913             fjx1             = _mm_setzero_ps();
1914             fjy1             = _mm_setzero_ps();
1915             fjz1             = _mm_setzero_ps();
1916             fjx2             = _mm_setzero_ps();
1917             fjy2             = _mm_setzero_ps();
1918             fjz2             = _mm_setzero_ps();
1919
1920             /**************************
1921              * CALCULATE INTERACTIONS *
1922              **************************/
1923
1924             if (gmx_mm_any_lt(rsq00,rcutoff2))
1925             {
1926
1927             r00              = _mm_mul_ps(rsq00,rinv00);
1928
1929             /* EWALD ELECTROSTATICS */
1930
1931             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1932             ewrt             = _mm_mul_ps(r00,ewtabscale);
1933             ewitab           = _mm_cvttps_epi32(ewrt);
1934             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
1935             ewitab           = _mm_slli_epi32(ewitab,2);
1936             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1937             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1938             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1939             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1940             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1941             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1942             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1943             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1944             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1945
1946             /* LENNARD-JONES DISPERSION/REPULSION */
1947
1948             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1949             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1950             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1951             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1952             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1953
1954             d                = _mm_sub_ps(r00,rswitch);
1955             d                = _mm_max_ps(d,_mm_setzero_ps());
1956             d2               = _mm_mul_ps(d,d);
1957             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1958
1959             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1960
1961             /* Evaluate switch function */
1962             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1963             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1964             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1965             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1966
1967             fscal            = _mm_add_ps(felec,fvdw);
1968
1969             fscal            = _mm_and_ps(fscal,cutoff_mask);
1970
1971             /* Calculate temporary vectorial force */
1972             tx               = _mm_mul_ps(fscal,dx00);
1973             ty               = _mm_mul_ps(fscal,dy00);
1974             tz               = _mm_mul_ps(fscal,dz00);
1975
1976             /* Update vectorial force */
1977             fix0             = _mm_add_ps(fix0,tx);
1978             fiy0             = _mm_add_ps(fiy0,ty);
1979             fiz0             = _mm_add_ps(fiz0,tz);
1980
1981             fjx0             = _mm_add_ps(fjx0,tx);
1982             fjy0             = _mm_add_ps(fjy0,ty);
1983             fjz0             = _mm_add_ps(fjz0,tz);
1984
1985             }
1986
1987             /**************************
1988              * CALCULATE INTERACTIONS *
1989              **************************/
1990
1991             if (gmx_mm_any_lt(rsq01,rcutoff2))
1992             {
1993
1994             r01              = _mm_mul_ps(rsq01,rinv01);
1995
1996             /* EWALD ELECTROSTATICS */
1997
1998             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1999             ewrt             = _mm_mul_ps(r01,ewtabscale);
2000             ewitab           = _mm_cvttps_epi32(ewrt);
2001             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2002             ewitab           = _mm_slli_epi32(ewitab,2);
2003             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2004             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2005             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2006             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2007             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2008             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2009             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2010             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2011             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2012
2013             d                = _mm_sub_ps(r01,rswitch);
2014             d                = _mm_max_ps(d,_mm_setzero_ps());
2015             d2               = _mm_mul_ps(d,d);
2016             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2017
2018             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2019
2020             /* Evaluate switch function */
2021             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2022             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2023             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2024
2025             fscal            = felec;
2026
2027             fscal            = _mm_and_ps(fscal,cutoff_mask);
2028
2029             /* Calculate temporary vectorial force */
2030             tx               = _mm_mul_ps(fscal,dx01);
2031             ty               = _mm_mul_ps(fscal,dy01);
2032             tz               = _mm_mul_ps(fscal,dz01);
2033
2034             /* Update vectorial force */
2035             fix0             = _mm_add_ps(fix0,tx);
2036             fiy0             = _mm_add_ps(fiy0,ty);
2037             fiz0             = _mm_add_ps(fiz0,tz);
2038
2039             fjx1             = _mm_add_ps(fjx1,tx);
2040             fjy1             = _mm_add_ps(fjy1,ty);
2041             fjz1             = _mm_add_ps(fjz1,tz);
2042
2043             }
2044
2045             /**************************
2046              * CALCULATE INTERACTIONS *
2047              **************************/
2048
2049             if (gmx_mm_any_lt(rsq02,rcutoff2))
2050             {
2051
2052             r02              = _mm_mul_ps(rsq02,rinv02);
2053
2054             /* EWALD ELECTROSTATICS */
2055
2056             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2057             ewrt             = _mm_mul_ps(r02,ewtabscale);
2058             ewitab           = _mm_cvttps_epi32(ewrt);
2059             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2060             ewitab           = _mm_slli_epi32(ewitab,2);
2061             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2062             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2063             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2064             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2065             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2066             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2067             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2068             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2069             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2070
2071             d                = _mm_sub_ps(r02,rswitch);
2072             d                = _mm_max_ps(d,_mm_setzero_ps());
2073             d2               = _mm_mul_ps(d,d);
2074             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2075
2076             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2077
2078             /* Evaluate switch function */
2079             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2080             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2081             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2082
2083             fscal            = felec;
2084
2085             fscal            = _mm_and_ps(fscal,cutoff_mask);
2086
2087             /* Calculate temporary vectorial force */
2088             tx               = _mm_mul_ps(fscal,dx02);
2089             ty               = _mm_mul_ps(fscal,dy02);
2090             tz               = _mm_mul_ps(fscal,dz02);
2091
2092             /* Update vectorial force */
2093             fix0             = _mm_add_ps(fix0,tx);
2094             fiy0             = _mm_add_ps(fiy0,ty);
2095             fiz0             = _mm_add_ps(fiz0,tz);
2096
2097             fjx2             = _mm_add_ps(fjx2,tx);
2098             fjy2             = _mm_add_ps(fjy2,ty);
2099             fjz2             = _mm_add_ps(fjz2,tz);
2100
2101             }
2102
2103             /**************************
2104              * CALCULATE INTERACTIONS *
2105              **************************/
2106
2107             if (gmx_mm_any_lt(rsq10,rcutoff2))
2108             {
2109
2110             r10              = _mm_mul_ps(rsq10,rinv10);
2111
2112             /* EWALD ELECTROSTATICS */
2113
2114             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2115             ewrt             = _mm_mul_ps(r10,ewtabscale);
2116             ewitab           = _mm_cvttps_epi32(ewrt);
2117             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2118             ewitab           = _mm_slli_epi32(ewitab,2);
2119             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2120             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2121             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2122             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2123             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2124             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2125             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2126             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2127             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2128
2129             d                = _mm_sub_ps(r10,rswitch);
2130             d                = _mm_max_ps(d,_mm_setzero_ps());
2131             d2               = _mm_mul_ps(d,d);
2132             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2133
2134             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2135
2136             /* Evaluate switch function */
2137             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2138             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2139             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2140
2141             fscal            = felec;
2142
2143             fscal            = _mm_and_ps(fscal,cutoff_mask);
2144
2145             /* Calculate temporary vectorial force */
2146             tx               = _mm_mul_ps(fscal,dx10);
2147             ty               = _mm_mul_ps(fscal,dy10);
2148             tz               = _mm_mul_ps(fscal,dz10);
2149
2150             /* Update vectorial force */
2151             fix1             = _mm_add_ps(fix1,tx);
2152             fiy1             = _mm_add_ps(fiy1,ty);
2153             fiz1             = _mm_add_ps(fiz1,tz);
2154
2155             fjx0             = _mm_add_ps(fjx0,tx);
2156             fjy0             = _mm_add_ps(fjy0,ty);
2157             fjz0             = _mm_add_ps(fjz0,tz);
2158
2159             }
2160
2161             /**************************
2162              * CALCULATE INTERACTIONS *
2163              **************************/
2164
2165             if (gmx_mm_any_lt(rsq11,rcutoff2))
2166             {
2167
2168             r11              = _mm_mul_ps(rsq11,rinv11);
2169
2170             /* EWALD ELECTROSTATICS */
2171
2172             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2173             ewrt             = _mm_mul_ps(r11,ewtabscale);
2174             ewitab           = _mm_cvttps_epi32(ewrt);
2175             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2176             ewitab           = _mm_slli_epi32(ewitab,2);
2177             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2178             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2179             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2180             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2181             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2182             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2183             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2184             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2185             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2186
2187             d                = _mm_sub_ps(r11,rswitch);
2188             d                = _mm_max_ps(d,_mm_setzero_ps());
2189             d2               = _mm_mul_ps(d,d);
2190             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2191
2192             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2193
2194             /* Evaluate switch function */
2195             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2196             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2197             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2198
2199             fscal            = felec;
2200
2201             fscal            = _mm_and_ps(fscal,cutoff_mask);
2202
2203             /* Calculate temporary vectorial force */
2204             tx               = _mm_mul_ps(fscal,dx11);
2205             ty               = _mm_mul_ps(fscal,dy11);
2206             tz               = _mm_mul_ps(fscal,dz11);
2207
2208             /* Update vectorial force */
2209             fix1             = _mm_add_ps(fix1,tx);
2210             fiy1             = _mm_add_ps(fiy1,ty);
2211             fiz1             = _mm_add_ps(fiz1,tz);
2212
2213             fjx1             = _mm_add_ps(fjx1,tx);
2214             fjy1             = _mm_add_ps(fjy1,ty);
2215             fjz1             = _mm_add_ps(fjz1,tz);
2216
2217             }
2218
2219             /**************************
2220              * CALCULATE INTERACTIONS *
2221              **************************/
2222
2223             if (gmx_mm_any_lt(rsq12,rcutoff2))
2224             {
2225
2226             r12              = _mm_mul_ps(rsq12,rinv12);
2227
2228             /* EWALD ELECTROSTATICS */
2229
2230             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2231             ewrt             = _mm_mul_ps(r12,ewtabscale);
2232             ewitab           = _mm_cvttps_epi32(ewrt);
2233             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2234             ewitab           = _mm_slli_epi32(ewitab,2);
2235             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2236             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2237             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2238             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2239             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2240             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2241             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2242             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2243             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2244
2245             d                = _mm_sub_ps(r12,rswitch);
2246             d                = _mm_max_ps(d,_mm_setzero_ps());
2247             d2               = _mm_mul_ps(d,d);
2248             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2249
2250             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2251
2252             /* Evaluate switch function */
2253             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2254             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2255             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2256
2257             fscal            = felec;
2258
2259             fscal            = _mm_and_ps(fscal,cutoff_mask);
2260
2261             /* Calculate temporary vectorial force */
2262             tx               = _mm_mul_ps(fscal,dx12);
2263             ty               = _mm_mul_ps(fscal,dy12);
2264             tz               = _mm_mul_ps(fscal,dz12);
2265
2266             /* Update vectorial force */
2267             fix1             = _mm_add_ps(fix1,tx);
2268             fiy1             = _mm_add_ps(fiy1,ty);
2269             fiz1             = _mm_add_ps(fiz1,tz);
2270
2271             fjx2             = _mm_add_ps(fjx2,tx);
2272             fjy2             = _mm_add_ps(fjy2,ty);
2273             fjz2             = _mm_add_ps(fjz2,tz);
2274
2275             }
2276
2277             /**************************
2278              * CALCULATE INTERACTIONS *
2279              **************************/
2280
2281             if (gmx_mm_any_lt(rsq20,rcutoff2))
2282             {
2283
2284             r20              = _mm_mul_ps(rsq20,rinv20);
2285
2286             /* EWALD ELECTROSTATICS */
2287
2288             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2289             ewrt             = _mm_mul_ps(r20,ewtabscale);
2290             ewitab           = _mm_cvttps_epi32(ewrt);
2291             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2292             ewitab           = _mm_slli_epi32(ewitab,2);
2293             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2294             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2295             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2296             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2297             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2298             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2299             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2300             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2301             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2302
2303             d                = _mm_sub_ps(r20,rswitch);
2304             d                = _mm_max_ps(d,_mm_setzero_ps());
2305             d2               = _mm_mul_ps(d,d);
2306             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2307
2308             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2309
2310             /* Evaluate switch function */
2311             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2312             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2313             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2314
2315             fscal            = felec;
2316
2317             fscal            = _mm_and_ps(fscal,cutoff_mask);
2318
2319             /* Calculate temporary vectorial force */
2320             tx               = _mm_mul_ps(fscal,dx20);
2321             ty               = _mm_mul_ps(fscal,dy20);
2322             tz               = _mm_mul_ps(fscal,dz20);
2323
2324             /* Update vectorial force */
2325             fix2             = _mm_add_ps(fix2,tx);
2326             fiy2             = _mm_add_ps(fiy2,ty);
2327             fiz2             = _mm_add_ps(fiz2,tz);
2328
2329             fjx0             = _mm_add_ps(fjx0,tx);
2330             fjy0             = _mm_add_ps(fjy0,ty);
2331             fjz0             = _mm_add_ps(fjz0,tz);
2332
2333             }
2334
2335             /**************************
2336              * CALCULATE INTERACTIONS *
2337              **************************/
2338
2339             if (gmx_mm_any_lt(rsq21,rcutoff2))
2340             {
2341
2342             r21              = _mm_mul_ps(rsq21,rinv21);
2343
2344             /* EWALD ELECTROSTATICS */
2345
2346             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2347             ewrt             = _mm_mul_ps(r21,ewtabscale);
2348             ewitab           = _mm_cvttps_epi32(ewrt);
2349             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2350             ewitab           = _mm_slli_epi32(ewitab,2);
2351             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2352             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2353             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2354             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2355             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2356             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2357             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2358             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2359             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2360
2361             d                = _mm_sub_ps(r21,rswitch);
2362             d                = _mm_max_ps(d,_mm_setzero_ps());
2363             d2               = _mm_mul_ps(d,d);
2364             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2365
2366             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2367
2368             /* Evaluate switch function */
2369             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2370             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2371             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2372
2373             fscal            = felec;
2374
2375             fscal            = _mm_and_ps(fscal,cutoff_mask);
2376
2377             /* Calculate temporary vectorial force */
2378             tx               = _mm_mul_ps(fscal,dx21);
2379             ty               = _mm_mul_ps(fscal,dy21);
2380             tz               = _mm_mul_ps(fscal,dz21);
2381
2382             /* Update vectorial force */
2383             fix2             = _mm_add_ps(fix2,tx);
2384             fiy2             = _mm_add_ps(fiy2,ty);
2385             fiz2             = _mm_add_ps(fiz2,tz);
2386
2387             fjx1             = _mm_add_ps(fjx1,tx);
2388             fjy1             = _mm_add_ps(fjy1,ty);
2389             fjz1             = _mm_add_ps(fjz1,tz);
2390
2391             }
2392
2393             /**************************
2394              * CALCULATE INTERACTIONS *
2395              **************************/
2396
2397             if (gmx_mm_any_lt(rsq22,rcutoff2))
2398             {
2399
2400             r22              = _mm_mul_ps(rsq22,rinv22);
2401
2402             /* EWALD ELECTROSTATICS */
2403
2404             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2405             ewrt             = _mm_mul_ps(r22,ewtabscale);
2406             ewitab           = _mm_cvttps_epi32(ewrt);
2407             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2408             ewitab           = _mm_slli_epi32(ewitab,2);
2409             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2410             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2411             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2412             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2413             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2414             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2415             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2416             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2417             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2418
2419             d                = _mm_sub_ps(r22,rswitch);
2420             d                = _mm_max_ps(d,_mm_setzero_ps());
2421             d2               = _mm_mul_ps(d,d);
2422             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2423
2424             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2425
2426             /* Evaluate switch function */
2427             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2428             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2429             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2430
2431             fscal            = felec;
2432
2433             fscal            = _mm_and_ps(fscal,cutoff_mask);
2434
2435             /* Calculate temporary vectorial force */
2436             tx               = _mm_mul_ps(fscal,dx22);
2437             ty               = _mm_mul_ps(fscal,dy22);
2438             tz               = _mm_mul_ps(fscal,dz22);
2439
2440             /* Update vectorial force */
2441             fix2             = _mm_add_ps(fix2,tx);
2442             fiy2             = _mm_add_ps(fiy2,ty);
2443             fiz2             = _mm_add_ps(fiz2,tz);
2444
2445             fjx2             = _mm_add_ps(fjx2,tx);
2446             fjy2             = _mm_add_ps(fjy2,ty);
2447             fjz2             = _mm_add_ps(fjz2,tz);
2448
2449             }
2450
2451             fjptrA             = f+j_coord_offsetA;
2452             fjptrB             = f+j_coord_offsetB;
2453             fjptrC             = f+j_coord_offsetC;
2454             fjptrD             = f+j_coord_offsetD;
2455
2456             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2457                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2458
2459             /* Inner loop uses 573 flops */
2460         }
2461
2462         if(jidx<j_index_end)
2463         {
2464
2465             /* Get j neighbor index, and coordinate index */
2466             jnrlistA         = jjnr[jidx];
2467             jnrlistB         = jjnr[jidx+1];
2468             jnrlistC         = jjnr[jidx+2];
2469             jnrlistD         = jjnr[jidx+3];
2470             /* Sign of each element will be negative for non-real atoms.
2471              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2472              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2473              */
2474             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2475             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2476             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2477             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2478             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2479             j_coord_offsetA  = DIM*jnrA;
2480             j_coord_offsetB  = DIM*jnrB;
2481             j_coord_offsetC  = DIM*jnrC;
2482             j_coord_offsetD  = DIM*jnrD;
2483
2484             /* load j atom coordinates */
2485             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2486                                               x+j_coord_offsetC,x+j_coord_offsetD,
2487                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2488
2489             /* Calculate displacement vector */
2490             dx00             = _mm_sub_ps(ix0,jx0);
2491             dy00             = _mm_sub_ps(iy0,jy0);
2492             dz00             = _mm_sub_ps(iz0,jz0);
2493             dx01             = _mm_sub_ps(ix0,jx1);
2494             dy01             = _mm_sub_ps(iy0,jy1);
2495             dz01             = _mm_sub_ps(iz0,jz1);
2496             dx02             = _mm_sub_ps(ix0,jx2);
2497             dy02             = _mm_sub_ps(iy0,jy2);
2498             dz02             = _mm_sub_ps(iz0,jz2);
2499             dx10             = _mm_sub_ps(ix1,jx0);
2500             dy10             = _mm_sub_ps(iy1,jy0);
2501             dz10             = _mm_sub_ps(iz1,jz0);
2502             dx11             = _mm_sub_ps(ix1,jx1);
2503             dy11             = _mm_sub_ps(iy1,jy1);
2504             dz11             = _mm_sub_ps(iz1,jz1);
2505             dx12             = _mm_sub_ps(ix1,jx2);
2506             dy12             = _mm_sub_ps(iy1,jy2);
2507             dz12             = _mm_sub_ps(iz1,jz2);
2508             dx20             = _mm_sub_ps(ix2,jx0);
2509             dy20             = _mm_sub_ps(iy2,jy0);
2510             dz20             = _mm_sub_ps(iz2,jz0);
2511             dx21             = _mm_sub_ps(ix2,jx1);
2512             dy21             = _mm_sub_ps(iy2,jy1);
2513             dz21             = _mm_sub_ps(iz2,jz1);
2514             dx22             = _mm_sub_ps(ix2,jx2);
2515             dy22             = _mm_sub_ps(iy2,jy2);
2516             dz22             = _mm_sub_ps(iz2,jz2);
2517
2518             /* Calculate squared distance and things based on it */
2519             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2520             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2521             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2522             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2523             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2524             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2525             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2526             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2527             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2528
2529             rinv00           = sse41_invsqrt_f(rsq00);
2530             rinv01           = sse41_invsqrt_f(rsq01);
2531             rinv02           = sse41_invsqrt_f(rsq02);
2532             rinv10           = sse41_invsqrt_f(rsq10);
2533             rinv11           = sse41_invsqrt_f(rsq11);
2534             rinv12           = sse41_invsqrt_f(rsq12);
2535             rinv20           = sse41_invsqrt_f(rsq20);
2536             rinv21           = sse41_invsqrt_f(rsq21);
2537             rinv22           = sse41_invsqrt_f(rsq22);
2538
2539             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2540             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2541             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2542             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2543             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2544             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2545             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2546             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2547             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2548
2549             fjx0             = _mm_setzero_ps();
2550             fjy0             = _mm_setzero_ps();
2551             fjz0             = _mm_setzero_ps();
2552             fjx1             = _mm_setzero_ps();
2553             fjy1             = _mm_setzero_ps();
2554             fjz1             = _mm_setzero_ps();
2555             fjx2             = _mm_setzero_ps();
2556             fjy2             = _mm_setzero_ps();
2557             fjz2             = _mm_setzero_ps();
2558
2559             /**************************
2560              * CALCULATE INTERACTIONS *
2561              **************************/
2562
2563             if (gmx_mm_any_lt(rsq00,rcutoff2))
2564             {
2565
2566             r00              = _mm_mul_ps(rsq00,rinv00);
2567             r00              = _mm_andnot_ps(dummy_mask,r00);
2568
2569             /* EWALD ELECTROSTATICS */
2570
2571             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2572             ewrt             = _mm_mul_ps(r00,ewtabscale);
2573             ewitab           = _mm_cvttps_epi32(ewrt);
2574             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2575             ewitab           = _mm_slli_epi32(ewitab,2);
2576             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2577             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2578             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2579             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2580             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2581             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2582             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2583             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2584             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2585
2586             /* LENNARD-JONES DISPERSION/REPULSION */
2587
2588             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2589             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2590             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2591             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2592             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2593
2594             d                = _mm_sub_ps(r00,rswitch);
2595             d                = _mm_max_ps(d,_mm_setzero_ps());
2596             d2               = _mm_mul_ps(d,d);
2597             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2598
2599             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2600
2601             /* Evaluate switch function */
2602             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2603             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2604             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2605             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2606
2607             fscal            = _mm_add_ps(felec,fvdw);
2608
2609             fscal            = _mm_and_ps(fscal,cutoff_mask);
2610
2611             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2612
2613             /* Calculate temporary vectorial force */
2614             tx               = _mm_mul_ps(fscal,dx00);
2615             ty               = _mm_mul_ps(fscal,dy00);
2616             tz               = _mm_mul_ps(fscal,dz00);
2617
2618             /* Update vectorial force */
2619             fix0             = _mm_add_ps(fix0,tx);
2620             fiy0             = _mm_add_ps(fiy0,ty);
2621             fiz0             = _mm_add_ps(fiz0,tz);
2622
2623             fjx0             = _mm_add_ps(fjx0,tx);
2624             fjy0             = _mm_add_ps(fjy0,ty);
2625             fjz0             = _mm_add_ps(fjz0,tz);
2626
2627             }
2628
2629             /**************************
2630              * CALCULATE INTERACTIONS *
2631              **************************/
2632
2633             if (gmx_mm_any_lt(rsq01,rcutoff2))
2634             {
2635
2636             r01              = _mm_mul_ps(rsq01,rinv01);
2637             r01              = _mm_andnot_ps(dummy_mask,r01);
2638
2639             /* EWALD ELECTROSTATICS */
2640
2641             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2642             ewrt             = _mm_mul_ps(r01,ewtabscale);
2643             ewitab           = _mm_cvttps_epi32(ewrt);
2644             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2645             ewitab           = _mm_slli_epi32(ewitab,2);
2646             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2647             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2648             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2649             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2650             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2651             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2652             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2653             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2654             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2655
2656             d                = _mm_sub_ps(r01,rswitch);
2657             d                = _mm_max_ps(d,_mm_setzero_ps());
2658             d2               = _mm_mul_ps(d,d);
2659             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2660
2661             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2662
2663             /* Evaluate switch function */
2664             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2665             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2666             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2667
2668             fscal            = felec;
2669
2670             fscal            = _mm_and_ps(fscal,cutoff_mask);
2671
2672             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2673
2674             /* Calculate temporary vectorial force */
2675             tx               = _mm_mul_ps(fscal,dx01);
2676             ty               = _mm_mul_ps(fscal,dy01);
2677             tz               = _mm_mul_ps(fscal,dz01);
2678
2679             /* Update vectorial force */
2680             fix0             = _mm_add_ps(fix0,tx);
2681             fiy0             = _mm_add_ps(fiy0,ty);
2682             fiz0             = _mm_add_ps(fiz0,tz);
2683
2684             fjx1             = _mm_add_ps(fjx1,tx);
2685             fjy1             = _mm_add_ps(fjy1,ty);
2686             fjz1             = _mm_add_ps(fjz1,tz);
2687
2688             }
2689
2690             /**************************
2691              * CALCULATE INTERACTIONS *
2692              **************************/
2693
2694             if (gmx_mm_any_lt(rsq02,rcutoff2))
2695             {
2696
2697             r02              = _mm_mul_ps(rsq02,rinv02);
2698             r02              = _mm_andnot_ps(dummy_mask,r02);
2699
2700             /* EWALD ELECTROSTATICS */
2701
2702             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2703             ewrt             = _mm_mul_ps(r02,ewtabscale);
2704             ewitab           = _mm_cvttps_epi32(ewrt);
2705             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2706             ewitab           = _mm_slli_epi32(ewitab,2);
2707             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2708             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2709             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2710             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2711             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2712             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2713             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2714             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2715             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2716
2717             d                = _mm_sub_ps(r02,rswitch);
2718             d                = _mm_max_ps(d,_mm_setzero_ps());
2719             d2               = _mm_mul_ps(d,d);
2720             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2721
2722             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2723
2724             /* Evaluate switch function */
2725             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2726             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2727             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2728
2729             fscal            = felec;
2730
2731             fscal            = _mm_and_ps(fscal,cutoff_mask);
2732
2733             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2734
2735             /* Calculate temporary vectorial force */
2736             tx               = _mm_mul_ps(fscal,dx02);
2737             ty               = _mm_mul_ps(fscal,dy02);
2738             tz               = _mm_mul_ps(fscal,dz02);
2739
2740             /* Update vectorial force */
2741             fix0             = _mm_add_ps(fix0,tx);
2742             fiy0             = _mm_add_ps(fiy0,ty);
2743             fiz0             = _mm_add_ps(fiz0,tz);
2744
2745             fjx2             = _mm_add_ps(fjx2,tx);
2746             fjy2             = _mm_add_ps(fjy2,ty);
2747             fjz2             = _mm_add_ps(fjz2,tz);
2748
2749             }
2750
2751             /**************************
2752              * CALCULATE INTERACTIONS *
2753              **************************/
2754
2755             if (gmx_mm_any_lt(rsq10,rcutoff2))
2756             {
2757
2758             r10              = _mm_mul_ps(rsq10,rinv10);
2759             r10              = _mm_andnot_ps(dummy_mask,r10);
2760
2761             /* EWALD ELECTROSTATICS */
2762
2763             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2764             ewrt             = _mm_mul_ps(r10,ewtabscale);
2765             ewitab           = _mm_cvttps_epi32(ewrt);
2766             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2767             ewitab           = _mm_slli_epi32(ewitab,2);
2768             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2769             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2770             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2771             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2772             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2773             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2774             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2775             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2776             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2777
2778             d                = _mm_sub_ps(r10,rswitch);
2779             d                = _mm_max_ps(d,_mm_setzero_ps());
2780             d2               = _mm_mul_ps(d,d);
2781             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2782
2783             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2784
2785             /* Evaluate switch function */
2786             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2787             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2788             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2789
2790             fscal            = felec;
2791
2792             fscal            = _mm_and_ps(fscal,cutoff_mask);
2793
2794             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2795
2796             /* Calculate temporary vectorial force */
2797             tx               = _mm_mul_ps(fscal,dx10);
2798             ty               = _mm_mul_ps(fscal,dy10);
2799             tz               = _mm_mul_ps(fscal,dz10);
2800
2801             /* Update vectorial force */
2802             fix1             = _mm_add_ps(fix1,tx);
2803             fiy1             = _mm_add_ps(fiy1,ty);
2804             fiz1             = _mm_add_ps(fiz1,tz);
2805
2806             fjx0             = _mm_add_ps(fjx0,tx);
2807             fjy0             = _mm_add_ps(fjy0,ty);
2808             fjz0             = _mm_add_ps(fjz0,tz);
2809
2810             }
2811
2812             /**************************
2813              * CALCULATE INTERACTIONS *
2814              **************************/
2815
2816             if (gmx_mm_any_lt(rsq11,rcutoff2))
2817             {
2818
2819             r11              = _mm_mul_ps(rsq11,rinv11);
2820             r11              = _mm_andnot_ps(dummy_mask,r11);
2821
2822             /* EWALD ELECTROSTATICS */
2823
2824             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2825             ewrt             = _mm_mul_ps(r11,ewtabscale);
2826             ewitab           = _mm_cvttps_epi32(ewrt);
2827             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2828             ewitab           = _mm_slli_epi32(ewitab,2);
2829             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2830             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2831             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2832             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2833             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2834             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2835             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2836             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2837             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2838
2839             d                = _mm_sub_ps(r11,rswitch);
2840             d                = _mm_max_ps(d,_mm_setzero_ps());
2841             d2               = _mm_mul_ps(d,d);
2842             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2843
2844             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2845
2846             /* Evaluate switch function */
2847             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2848             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2849             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2850
2851             fscal            = felec;
2852
2853             fscal            = _mm_and_ps(fscal,cutoff_mask);
2854
2855             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2856
2857             /* Calculate temporary vectorial force */
2858             tx               = _mm_mul_ps(fscal,dx11);
2859             ty               = _mm_mul_ps(fscal,dy11);
2860             tz               = _mm_mul_ps(fscal,dz11);
2861
2862             /* Update vectorial force */
2863             fix1             = _mm_add_ps(fix1,tx);
2864             fiy1             = _mm_add_ps(fiy1,ty);
2865             fiz1             = _mm_add_ps(fiz1,tz);
2866
2867             fjx1             = _mm_add_ps(fjx1,tx);
2868             fjy1             = _mm_add_ps(fjy1,ty);
2869             fjz1             = _mm_add_ps(fjz1,tz);
2870
2871             }
2872
2873             /**************************
2874              * CALCULATE INTERACTIONS *
2875              **************************/
2876
2877             if (gmx_mm_any_lt(rsq12,rcutoff2))
2878             {
2879
2880             r12              = _mm_mul_ps(rsq12,rinv12);
2881             r12              = _mm_andnot_ps(dummy_mask,r12);
2882
2883             /* EWALD ELECTROSTATICS */
2884
2885             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2886             ewrt             = _mm_mul_ps(r12,ewtabscale);
2887             ewitab           = _mm_cvttps_epi32(ewrt);
2888             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2889             ewitab           = _mm_slli_epi32(ewitab,2);
2890             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2891             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2892             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2893             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2894             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2895             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2896             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2897             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2898             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2899
2900             d                = _mm_sub_ps(r12,rswitch);
2901             d                = _mm_max_ps(d,_mm_setzero_ps());
2902             d2               = _mm_mul_ps(d,d);
2903             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2904
2905             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2906
2907             /* Evaluate switch function */
2908             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2909             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2910             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2911
2912             fscal            = felec;
2913
2914             fscal            = _mm_and_ps(fscal,cutoff_mask);
2915
2916             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2917
2918             /* Calculate temporary vectorial force */
2919             tx               = _mm_mul_ps(fscal,dx12);
2920             ty               = _mm_mul_ps(fscal,dy12);
2921             tz               = _mm_mul_ps(fscal,dz12);
2922
2923             /* Update vectorial force */
2924             fix1             = _mm_add_ps(fix1,tx);
2925             fiy1             = _mm_add_ps(fiy1,ty);
2926             fiz1             = _mm_add_ps(fiz1,tz);
2927
2928             fjx2             = _mm_add_ps(fjx2,tx);
2929             fjy2             = _mm_add_ps(fjy2,ty);
2930             fjz2             = _mm_add_ps(fjz2,tz);
2931
2932             }
2933
2934             /**************************
2935              * CALCULATE INTERACTIONS *
2936              **************************/
2937
2938             if (gmx_mm_any_lt(rsq20,rcutoff2))
2939             {
2940
2941             r20              = _mm_mul_ps(rsq20,rinv20);
2942             r20              = _mm_andnot_ps(dummy_mask,r20);
2943
2944             /* EWALD ELECTROSTATICS */
2945
2946             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2947             ewrt             = _mm_mul_ps(r20,ewtabscale);
2948             ewitab           = _mm_cvttps_epi32(ewrt);
2949             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
2950             ewitab           = _mm_slli_epi32(ewitab,2);
2951             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2952             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2953             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2954             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2955             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2956             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2957             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2958             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2959             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2960
2961             d                = _mm_sub_ps(r20,rswitch);
2962             d                = _mm_max_ps(d,_mm_setzero_ps());
2963             d2               = _mm_mul_ps(d,d);
2964             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2965
2966             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2967
2968             /* Evaluate switch function */
2969             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2970             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2971             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2972
2973             fscal            = felec;
2974
2975             fscal            = _mm_and_ps(fscal,cutoff_mask);
2976
2977             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2978
2979             /* Calculate temporary vectorial force */
2980             tx               = _mm_mul_ps(fscal,dx20);
2981             ty               = _mm_mul_ps(fscal,dy20);
2982             tz               = _mm_mul_ps(fscal,dz20);
2983
2984             /* Update vectorial force */
2985             fix2             = _mm_add_ps(fix2,tx);
2986             fiy2             = _mm_add_ps(fiy2,ty);
2987             fiz2             = _mm_add_ps(fiz2,tz);
2988
2989             fjx0             = _mm_add_ps(fjx0,tx);
2990             fjy0             = _mm_add_ps(fjy0,ty);
2991             fjz0             = _mm_add_ps(fjz0,tz);
2992
2993             }
2994
2995             /**************************
2996              * CALCULATE INTERACTIONS *
2997              **************************/
2998
2999             if (gmx_mm_any_lt(rsq21,rcutoff2))
3000             {
3001
3002             r21              = _mm_mul_ps(rsq21,rinv21);
3003             r21              = _mm_andnot_ps(dummy_mask,r21);
3004
3005             /* EWALD ELECTROSTATICS */
3006
3007             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3008             ewrt             = _mm_mul_ps(r21,ewtabscale);
3009             ewitab           = _mm_cvttps_epi32(ewrt);
3010             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3011             ewitab           = _mm_slli_epi32(ewitab,2);
3012             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3013             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3014             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3015             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3016             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3017             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3018             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3019             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3020             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3021
3022             d                = _mm_sub_ps(r21,rswitch);
3023             d                = _mm_max_ps(d,_mm_setzero_ps());
3024             d2               = _mm_mul_ps(d,d);
3025             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3026
3027             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3028
3029             /* Evaluate switch function */
3030             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3031             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3032             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
3033
3034             fscal            = felec;
3035
3036             fscal            = _mm_and_ps(fscal,cutoff_mask);
3037
3038             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3039
3040             /* Calculate temporary vectorial force */
3041             tx               = _mm_mul_ps(fscal,dx21);
3042             ty               = _mm_mul_ps(fscal,dy21);
3043             tz               = _mm_mul_ps(fscal,dz21);
3044
3045             /* Update vectorial force */
3046             fix2             = _mm_add_ps(fix2,tx);
3047             fiy2             = _mm_add_ps(fiy2,ty);
3048             fiz2             = _mm_add_ps(fiz2,tz);
3049
3050             fjx1             = _mm_add_ps(fjx1,tx);
3051             fjy1             = _mm_add_ps(fjy1,ty);
3052             fjz1             = _mm_add_ps(fjz1,tz);
3053
3054             }
3055
3056             /**************************
3057              * CALCULATE INTERACTIONS *
3058              **************************/
3059
3060             if (gmx_mm_any_lt(rsq22,rcutoff2))
3061             {
3062
3063             r22              = _mm_mul_ps(rsq22,rinv22);
3064             r22              = _mm_andnot_ps(dummy_mask,r22);
3065
3066             /* EWALD ELECTROSTATICS */
3067
3068             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3069             ewrt             = _mm_mul_ps(r22,ewtabscale);
3070             ewitab           = _mm_cvttps_epi32(ewrt);
3071             eweps            = _mm_sub_ps(ewrt,_mm_round_ps(ewrt, _MM_FROUND_FLOOR));
3072             ewitab           = _mm_slli_epi32(ewitab,2);
3073             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3074             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3075             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3076             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3077             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3078             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3079             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3080             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3081             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3082
3083             d                = _mm_sub_ps(r22,rswitch);
3084             d                = _mm_max_ps(d,_mm_setzero_ps());
3085             d2               = _mm_mul_ps(d,d);
3086             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3087
3088             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3089
3090             /* Evaluate switch function */
3091             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3092             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3093             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
3094
3095             fscal            = felec;
3096
3097             fscal            = _mm_and_ps(fscal,cutoff_mask);
3098
3099             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3100
3101             /* Calculate temporary vectorial force */
3102             tx               = _mm_mul_ps(fscal,dx22);
3103             ty               = _mm_mul_ps(fscal,dy22);
3104             tz               = _mm_mul_ps(fscal,dz22);
3105
3106             /* Update vectorial force */
3107             fix2             = _mm_add_ps(fix2,tx);
3108             fiy2             = _mm_add_ps(fiy2,ty);
3109             fiz2             = _mm_add_ps(fiz2,tz);
3110
3111             fjx2             = _mm_add_ps(fjx2,tx);
3112             fjy2             = _mm_add_ps(fjy2,ty);
3113             fjz2             = _mm_add_ps(fjz2,tz);
3114
3115             }
3116
3117             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3118             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3119             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3120             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3121
3122             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3123                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3124
3125             /* Inner loop uses 582 flops */
3126         }
3127
3128         /* End of innermost loop */
3129
3130         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3131                                               f+i_coord_offset,fshift+i_shift_offset);
3132
3133         /* Increment number of inner iterations */
3134         inneriter                  += j_index_end - j_index_start;
3135
3136         /* Outer loop uses 18 flops */
3137     }
3138
3139     /* Increment number of outer iterations */
3140     outeriter        += nri;
3141
3142     /* Update outer/inner flops */
3143
3144     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);
3145 }