Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse2_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water4-Water4
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwioffset3;
89     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
90     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
93     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
95     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
97     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
105     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
106     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
107     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
108     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
109     real             *charge;
110     int              nvdwtype;
111     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112     int              *vdwtype;
113     real             *vdwparam;
114     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
115     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
116     __m128i          ewitab;
117     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
118     real             *ewtab;
119     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
120     real             rswitch_scalar,d_scalar;
121     __m128           dummy_mask,cutoff_mask;
122     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
123     __m128           one     = _mm_set1_ps(1.0);
124     __m128           two     = _mm_set1_ps(2.0);
125     x                = xx[0];
126     f                = ff[0];
127
128     nri              = nlist->nri;
129     iinr             = nlist->iinr;
130     jindex           = nlist->jindex;
131     jjnr             = nlist->jjnr;
132     shiftidx         = nlist->shift;
133     gid              = nlist->gid;
134     shiftvec         = fr->shift_vec[0];
135     fshift           = fr->fshift[0];
136     facel            = _mm_set1_ps(fr->ic->epsfac);
137     charge           = mdatoms->chargeA;
138     nvdwtype         = fr->ntype;
139     vdwparam         = fr->nbfp;
140     vdwtype          = mdatoms->typeA;
141
142     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
143     ewtab            = fr->ic->tabq_coul_FDV0;
144     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
145     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
146
147     /* Setup water-specific parameters */
148     inr              = nlist->iinr[0];
149     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
150     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
151     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
152     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
153
154     jq1              = _mm_set1_ps(charge[inr+1]);
155     jq2              = _mm_set1_ps(charge[inr+2]);
156     jq3              = _mm_set1_ps(charge[inr+3]);
157     vdwjidx0A        = 2*vdwtype[inr+0];
158     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
159     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
160     qq11             = _mm_mul_ps(iq1,jq1);
161     qq12             = _mm_mul_ps(iq1,jq2);
162     qq13             = _mm_mul_ps(iq1,jq3);
163     qq21             = _mm_mul_ps(iq2,jq1);
164     qq22             = _mm_mul_ps(iq2,jq2);
165     qq23             = _mm_mul_ps(iq2,jq3);
166     qq31             = _mm_mul_ps(iq3,jq1);
167     qq32             = _mm_mul_ps(iq3,jq2);
168     qq33             = _mm_mul_ps(iq3,jq3);
169
170     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
171     rcutoff_scalar   = fr->ic->rcoulomb;
172     rcutoff          = _mm_set1_ps(rcutoff_scalar);
173     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
174
175     rswitch_scalar   = fr->ic->rcoulomb_switch;
176     rswitch          = _mm_set1_ps(rswitch_scalar);
177     /* Setup switch parameters */
178     d_scalar         = rcutoff_scalar-rswitch_scalar;
179     d                = _mm_set1_ps(d_scalar);
180     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
181     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
182     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
183     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
184     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
185     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
186
187     /* Avoid stupid compiler warnings */
188     jnrA = jnrB = jnrC = jnrD = 0;
189     j_coord_offsetA = 0;
190     j_coord_offsetB = 0;
191     j_coord_offsetC = 0;
192     j_coord_offsetD = 0;
193
194     outeriter        = 0;
195     inneriter        = 0;
196
197     for(iidx=0;iidx<4*DIM;iidx++)
198     {
199         scratch[iidx] = 0.0;
200     }  
201
202     /* Start outer loop over neighborlists */
203     for(iidx=0; iidx<nri; iidx++)
204     {
205         /* Load shift vector for this list */
206         i_shift_offset   = DIM*shiftidx[iidx];
207
208         /* Load limits for loop over neighbors */
209         j_index_start    = jindex[iidx];
210         j_index_end      = jindex[iidx+1];
211
212         /* Get outer coordinate index */
213         inr              = iinr[iidx];
214         i_coord_offset   = DIM*inr;
215
216         /* Load i particle coords and add shift vector */
217         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
218                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
219         
220         fix0             = _mm_setzero_ps();
221         fiy0             = _mm_setzero_ps();
222         fiz0             = _mm_setzero_ps();
223         fix1             = _mm_setzero_ps();
224         fiy1             = _mm_setzero_ps();
225         fiz1             = _mm_setzero_ps();
226         fix2             = _mm_setzero_ps();
227         fiy2             = _mm_setzero_ps();
228         fiz2             = _mm_setzero_ps();
229         fix3             = _mm_setzero_ps();
230         fiy3             = _mm_setzero_ps();
231         fiz3             = _mm_setzero_ps();
232
233         /* Reset potential sums */
234         velecsum         = _mm_setzero_ps();
235         vvdwsum          = _mm_setzero_ps();
236
237         /* Start inner kernel loop */
238         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
239         {
240
241             /* Get j neighbor index, and coordinate index */
242             jnrA             = jjnr[jidx];
243             jnrB             = jjnr[jidx+1];
244             jnrC             = jjnr[jidx+2];
245             jnrD             = jjnr[jidx+3];
246             j_coord_offsetA  = DIM*jnrA;
247             j_coord_offsetB  = DIM*jnrB;
248             j_coord_offsetC  = DIM*jnrC;
249             j_coord_offsetD  = DIM*jnrD;
250
251             /* load j atom coordinates */
252             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
253                                               x+j_coord_offsetC,x+j_coord_offsetD,
254                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
255                                               &jy2,&jz2,&jx3,&jy3,&jz3);
256
257             /* Calculate displacement vector */
258             dx00             = _mm_sub_ps(ix0,jx0);
259             dy00             = _mm_sub_ps(iy0,jy0);
260             dz00             = _mm_sub_ps(iz0,jz0);
261             dx11             = _mm_sub_ps(ix1,jx1);
262             dy11             = _mm_sub_ps(iy1,jy1);
263             dz11             = _mm_sub_ps(iz1,jz1);
264             dx12             = _mm_sub_ps(ix1,jx2);
265             dy12             = _mm_sub_ps(iy1,jy2);
266             dz12             = _mm_sub_ps(iz1,jz2);
267             dx13             = _mm_sub_ps(ix1,jx3);
268             dy13             = _mm_sub_ps(iy1,jy3);
269             dz13             = _mm_sub_ps(iz1,jz3);
270             dx21             = _mm_sub_ps(ix2,jx1);
271             dy21             = _mm_sub_ps(iy2,jy1);
272             dz21             = _mm_sub_ps(iz2,jz1);
273             dx22             = _mm_sub_ps(ix2,jx2);
274             dy22             = _mm_sub_ps(iy2,jy2);
275             dz22             = _mm_sub_ps(iz2,jz2);
276             dx23             = _mm_sub_ps(ix2,jx3);
277             dy23             = _mm_sub_ps(iy2,jy3);
278             dz23             = _mm_sub_ps(iz2,jz3);
279             dx31             = _mm_sub_ps(ix3,jx1);
280             dy31             = _mm_sub_ps(iy3,jy1);
281             dz31             = _mm_sub_ps(iz3,jz1);
282             dx32             = _mm_sub_ps(ix3,jx2);
283             dy32             = _mm_sub_ps(iy3,jy2);
284             dz32             = _mm_sub_ps(iz3,jz2);
285             dx33             = _mm_sub_ps(ix3,jx3);
286             dy33             = _mm_sub_ps(iy3,jy3);
287             dz33             = _mm_sub_ps(iz3,jz3);
288
289             /* Calculate squared distance and things based on it */
290             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
291             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
292             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
293             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
294             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
295             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
296             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
297             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
298             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
299             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
300
301             rinv00           = sse2_invsqrt_f(rsq00);
302             rinv11           = sse2_invsqrt_f(rsq11);
303             rinv12           = sse2_invsqrt_f(rsq12);
304             rinv13           = sse2_invsqrt_f(rsq13);
305             rinv21           = sse2_invsqrt_f(rsq21);
306             rinv22           = sse2_invsqrt_f(rsq22);
307             rinv23           = sse2_invsqrt_f(rsq23);
308             rinv31           = sse2_invsqrt_f(rsq31);
309             rinv32           = sse2_invsqrt_f(rsq32);
310             rinv33           = sse2_invsqrt_f(rsq33);
311
312             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
313             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
314             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
315             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
316             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
317             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
318             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
319             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
320             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
321             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
322
323             fjx0             = _mm_setzero_ps();
324             fjy0             = _mm_setzero_ps();
325             fjz0             = _mm_setzero_ps();
326             fjx1             = _mm_setzero_ps();
327             fjy1             = _mm_setzero_ps();
328             fjz1             = _mm_setzero_ps();
329             fjx2             = _mm_setzero_ps();
330             fjy2             = _mm_setzero_ps();
331             fjz2             = _mm_setzero_ps();
332             fjx3             = _mm_setzero_ps();
333             fjy3             = _mm_setzero_ps();
334             fjz3             = _mm_setzero_ps();
335
336             /**************************
337              * CALCULATE INTERACTIONS *
338              **************************/
339
340             if (gmx_mm_any_lt(rsq00,rcutoff2))
341             {
342
343             r00              = _mm_mul_ps(rsq00,rinv00);
344
345             /* LENNARD-JONES DISPERSION/REPULSION */
346
347             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
348             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
349             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
350             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
351             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
352
353             d                = _mm_sub_ps(r00,rswitch);
354             d                = _mm_max_ps(d,_mm_setzero_ps());
355             d2               = _mm_mul_ps(d,d);
356             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
357
358             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
359
360             /* Evaluate switch function */
361             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
362             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
363             vvdw             = _mm_mul_ps(vvdw,sw);
364             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
365
366             /* Update potential sum for this i atom from the interaction with this j atom. */
367             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
368             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
369
370             fscal            = fvdw;
371
372             fscal            = _mm_and_ps(fscal,cutoff_mask);
373
374             /* Calculate temporary vectorial force */
375             tx               = _mm_mul_ps(fscal,dx00);
376             ty               = _mm_mul_ps(fscal,dy00);
377             tz               = _mm_mul_ps(fscal,dz00);
378
379             /* Update vectorial force */
380             fix0             = _mm_add_ps(fix0,tx);
381             fiy0             = _mm_add_ps(fiy0,ty);
382             fiz0             = _mm_add_ps(fiz0,tz);
383
384             fjx0             = _mm_add_ps(fjx0,tx);
385             fjy0             = _mm_add_ps(fjy0,ty);
386             fjz0             = _mm_add_ps(fjz0,tz);
387             
388             }
389
390             /**************************
391              * CALCULATE INTERACTIONS *
392              **************************/
393
394             if (gmx_mm_any_lt(rsq11,rcutoff2))
395             {
396
397             r11              = _mm_mul_ps(rsq11,rinv11);
398
399             /* EWALD ELECTROSTATICS */
400
401             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
402             ewrt             = _mm_mul_ps(r11,ewtabscale);
403             ewitab           = _mm_cvttps_epi32(ewrt);
404             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
405             ewitab           = _mm_slli_epi32(ewitab,2);
406             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
407             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
408             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
409             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
410             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
411             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
412             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
413             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
414             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
415
416             d                = _mm_sub_ps(r11,rswitch);
417             d                = _mm_max_ps(d,_mm_setzero_ps());
418             d2               = _mm_mul_ps(d,d);
419             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
420
421             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
422
423             /* Evaluate switch function */
424             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
425             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
426             velec            = _mm_mul_ps(velec,sw);
427             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
428
429             /* Update potential sum for this i atom from the interaction with this j atom. */
430             velec            = _mm_and_ps(velec,cutoff_mask);
431             velecsum         = _mm_add_ps(velecsum,velec);
432
433             fscal            = felec;
434
435             fscal            = _mm_and_ps(fscal,cutoff_mask);
436
437             /* Calculate temporary vectorial force */
438             tx               = _mm_mul_ps(fscal,dx11);
439             ty               = _mm_mul_ps(fscal,dy11);
440             tz               = _mm_mul_ps(fscal,dz11);
441
442             /* Update vectorial force */
443             fix1             = _mm_add_ps(fix1,tx);
444             fiy1             = _mm_add_ps(fiy1,ty);
445             fiz1             = _mm_add_ps(fiz1,tz);
446
447             fjx1             = _mm_add_ps(fjx1,tx);
448             fjy1             = _mm_add_ps(fjy1,ty);
449             fjz1             = _mm_add_ps(fjz1,tz);
450             
451             }
452
453             /**************************
454              * CALCULATE INTERACTIONS *
455              **************************/
456
457             if (gmx_mm_any_lt(rsq12,rcutoff2))
458             {
459
460             r12              = _mm_mul_ps(rsq12,rinv12);
461
462             /* EWALD ELECTROSTATICS */
463
464             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
465             ewrt             = _mm_mul_ps(r12,ewtabscale);
466             ewitab           = _mm_cvttps_epi32(ewrt);
467             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
468             ewitab           = _mm_slli_epi32(ewitab,2);
469             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
470             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
471             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
472             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
473             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
474             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
475             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
476             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
477             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
478
479             d                = _mm_sub_ps(r12,rswitch);
480             d                = _mm_max_ps(d,_mm_setzero_ps());
481             d2               = _mm_mul_ps(d,d);
482             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
483
484             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
485
486             /* Evaluate switch function */
487             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
488             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
489             velec            = _mm_mul_ps(velec,sw);
490             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
491
492             /* Update potential sum for this i atom from the interaction with this j atom. */
493             velec            = _mm_and_ps(velec,cutoff_mask);
494             velecsum         = _mm_add_ps(velecsum,velec);
495
496             fscal            = felec;
497
498             fscal            = _mm_and_ps(fscal,cutoff_mask);
499
500             /* Calculate temporary vectorial force */
501             tx               = _mm_mul_ps(fscal,dx12);
502             ty               = _mm_mul_ps(fscal,dy12);
503             tz               = _mm_mul_ps(fscal,dz12);
504
505             /* Update vectorial force */
506             fix1             = _mm_add_ps(fix1,tx);
507             fiy1             = _mm_add_ps(fiy1,ty);
508             fiz1             = _mm_add_ps(fiz1,tz);
509
510             fjx2             = _mm_add_ps(fjx2,tx);
511             fjy2             = _mm_add_ps(fjy2,ty);
512             fjz2             = _mm_add_ps(fjz2,tz);
513             
514             }
515
516             /**************************
517              * CALCULATE INTERACTIONS *
518              **************************/
519
520             if (gmx_mm_any_lt(rsq13,rcutoff2))
521             {
522
523             r13              = _mm_mul_ps(rsq13,rinv13);
524
525             /* EWALD ELECTROSTATICS */
526
527             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
528             ewrt             = _mm_mul_ps(r13,ewtabscale);
529             ewitab           = _mm_cvttps_epi32(ewrt);
530             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
531             ewitab           = _mm_slli_epi32(ewitab,2);
532             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
533             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
534             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
535             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
536             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
537             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
538             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
539             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
540             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
541
542             d                = _mm_sub_ps(r13,rswitch);
543             d                = _mm_max_ps(d,_mm_setzero_ps());
544             d2               = _mm_mul_ps(d,d);
545             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
546
547             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
548
549             /* Evaluate switch function */
550             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
551             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
552             velec            = _mm_mul_ps(velec,sw);
553             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
554
555             /* Update potential sum for this i atom from the interaction with this j atom. */
556             velec            = _mm_and_ps(velec,cutoff_mask);
557             velecsum         = _mm_add_ps(velecsum,velec);
558
559             fscal            = felec;
560
561             fscal            = _mm_and_ps(fscal,cutoff_mask);
562
563             /* Calculate temporary vectorial force */
564             tx               = _mm_mul_ps(fscal,dx13);
565             ty               = _mm_mul_ps(fscal,dy13);
566             tz               = _mm_mul_ps(fscal,dz13);
567
568             /* Update vectorial force */
569             fix1             = _mm_add_ps(fix1,tx);
570             fiy1             = _mm_add_ps(fiy1,ty);
571             fiz1             = _mm_add_ps(fiz1,tz);
572
573             fjx3             = _mm_add_ps(fjx3,tx);
574             fjy3             = _mm_add_ps(fjy3,ty);
575             fjz3             = _mm_add_ps(fjz3,tz);
576             
577             }
578
579             /**************************
580              * CALCULATE INTERACTIONS *
581              **************************/
582
583             if (gmx_mm_any_lt(rsq21,rcutoff2))
584             {
585
586             r21              = _mm_mul_ps(rsq21,rinv21);
587
588             /* EWALD ELECTROSTATICS */
589
590             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
591             ewrt             = _mm_mul_ps(r21,ewtabscale);
592             ewitab           = _mm_cvttps_epi32(ewrt);
593             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
594             ewitab           = _mm_slli_epi32(ewitab,2);
595             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
596             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
597             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
598             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
599             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
600             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
601             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
602             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
603             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
604
605             d                = _mm_sub_ps(r21,rswitch);
606             d                = _mm_max_ps(d,_mm_setzero_ps());
607             d2               = _mm_mul_ps(d,d);
608             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
609
610             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
611
612             /* Evaluate switch function */
613             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
614             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
615             velec            = _mm_mul_ps(velec,sw);
616             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
617
618             /* Update potential sum for this i atom from the interaction with this j atom. */
619             velec            = _mm_and_ps(velec,cutoff_mask);
620             velecsum         = _mm_add_ps(velecsum,velec);
621
622             fscal            = felec;
623
624             fscal            = _mm_and_ps(fscal,cutoff_mask);
625
626             /* Calculate temporary vectorial force */
627             tx               = _mm_mul_ps(fscal,dx21);
628             ty               = _mm_mul_ps(fscal,dy21);
629             tz               = _mm_mul_ps(fscal,dz21);
630
631             /* Update vectorial force */
632             fix2             = _mm_add_ps(fix2,tx);
633             fiy2             = _mm_add_ps(fiy2,ty);
634             fiz2             = _mm_add_ps(fiz2,tz);
635
636             fjx1             = _mm_add_ps(fjx1,tx);
637             fjy1             = _mm_add_ps(fjy1,ty);
638             fjz1             = _mm_add_ps(fjz1,tz);
639             
640             }
641
642             /**************************
643              * CALCULATE INTERACTIONS *
644              **************************/
645
646             if (gmx_mm_any_lt(rsq22,rcutoff2))
647             {
648
649             r22              = _mm_mul_ps(rsq22,rinv22);
650
651             /* EWALD ELECTROSTATICS */
652
653             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
654             ewrt             = _mm_mul_ps(r22,ewtabscale);
655             ewitab           = _mm_cvttps_epi32(ewrt);
656             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
657             ewitab           = _mm_slli_epi32(ewitab,2);
658             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
659             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
660             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
661             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
662             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
663             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
664             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
665             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
666             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
667
668             d                = _mm_sub_ps(r22,rswitch);
669             d                = _mm_max_ps(d,_mm_setzero_ps());
670             d2               = _mm_mul_ps(d,d);
671             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
672
673             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
674
675             /* Evaluate switch function */
676             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
677             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
678             velec            = _mm_mul_ps(velec,sw);
679             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
680
681             /* Update potential sum for this i atom from the interaction with this j atom. */
682             velec            = _mm_and_ps(velec,cutoff_mask);
683             velecsum         = _mm_add_ps(velecsum,velec);
684
685             fscal            = felec;
686
687             fscal            = _mm_and_ps(fscal,cutoff_mask);
688
689             /* Calculate temporary vectorial force */
690             tx               = _mm_mul_ps(fscal,dx22);
691             ty               = _mm_mul_ps(fscal,dy22);
692             tz               = _mm_mul_ps(fscal,dz22);
693
694             /* Update vectorial force */
695             fix2             = _mm_add_ps(fix2,tx);
696             fiy2             = _mm_add_ps(fiy2,ty);
697             fiz2             = _mm_add_ps(fiz2,tz);
698
699             fjx2             = _mm_add_ps(fjx2,tx);
700             fjy2             = _mm_add_ps(fjy2,ty);
701             fjz2             = _mm_add_ps(fjz2,tz);
702             
703             }
704
705             /**************************
706              * CALCULATE INTERACTIONS *
707              **************************/
708
709             if (gmx_mm_any_lt(rsq23,rcutoff2))
710             {
711
712             r23              = _mm_mul_ps(rsq23,rinv23);
713
714             /* EWALD ELECTROSTATICS */
715
716             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
717             ewrt             = _mm_mul_ps(r23,ewtabscale);
718             ewitab           = _mm_cvttps_epi32(ewrt);
719             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
720             ewitab           = _mm_slli_epi32(ewitab,2);
721             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
722             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
723             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
724             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
725             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
726             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
727             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
728             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
729             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
730
731             d                = _mm_sub_ps(r23,rswitch);
732             d                = _mm_max_ps(d,_mm_setzero_ps());
733             d2               = _mm_mul_ps(d,d);
734             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
735
736             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
737
738             /* Evaluate switch function */
739             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
740             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
741             velec            = _mm_mul_ps(velec,sw);
742             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
743
744             /* Update potential sum for this i atom from the interaction with this j atom. */
745             velec            = _mm_and_ps(velec,cutoff_mask);
746             velecsum         = _mm_add_ps(velecsum,velec);
747
748             fscal            = felec;
749
750             fscal            = _mm_and_ps(fscal,cutoff_mask);
751
752             /* Calculate temporary vectorial force */
753             tx               = _mm_mul_ps(fscal,dx23);
754             ty               = _mm_mul_ps(fscal,dy23);
755             tz               = _mm_mul_ps(fscal,dz23);
756
757             /* Update vectorial force */
758             fix2             = _mm_add_ps(fix2,tx);
759             fiy2             = _mm_add_ps(fiy2,ty);
760             fiz2             = _mm_add_ps(fiz2,tz);
761
762             fjx3             = _mm_add_ps(fjx3,tx);
763             fjy3             = _mm_add_ps(fjy3,ty);
764             fjz3             = _mm_add_ps(fjz3,tz);
765             
766             }
767
768             /**************************
769              * CALCULATE INTERACTIONS *
770              **************************/
771
772             if (gmx_mm_any_lt(rsq31,rcutoff2))
773             {
774
775             r31              = _mm_mul_ps(rsq31,rinv31);
776
777             /* EWALD ELECTROSTATICS */
778
779             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
780             ewrt             = _mm_mul_ps(r31,ewtabscale);
781             ewitab           = _mm_cvttps_epi32(ewrt);
782             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
783             ewitab           = _mm_slli_epi32(ewitab,2);
784             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
785             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
786             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
787             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
788             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
789             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
790             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
791             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
792             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
793
794             d                = _mm_sub_ps(r31,rswitch);
795             d                = _mm_max_ps(d,_mm_setzero_ps());
796             d2               = _mm_mul_ps(d,d);
797             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
798
799             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
800
801             /* Evaluate switch function */
802             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
803             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
804             velec            = _mm_mul_ps(velec,sw);
805             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
806
807             /* Update potential sum for this i atom from the interaction with this j atom. */
808             velec            = _mm_and_ps(velec,cutoff_mask);
809             velecsum         = _mm_add_ps(velecsum,velec);
810
811             fscal            = felec;
812
813             fscal            = _mm_and_ps(fscal,cutoff_mask);
814
815             /* Calculate temporary vectorial force */
816             tx               = _mm_mul_ps(fscal,dx31);
817             ty               = _mm_mul_ps(fscal,dy31);
818             tz               = _mm_mul_ps(fscal,dz31);
819
820             /* Update vectorial force */
821             fix3             = _mm_add_ps(fix3,tx);
822             fiy3             = _mm_add_ps(fiy3,ty);
823             fiz3             = _mm_add_ps(fiz3,tz);
824
825             fjx1             = _mm_add_ps(fjx1,tx);
826             fjy1             = _mm_add_ps(fjy1,ty);
827             fjz1             = _mm_add_ps(fjz1,tz);
828             
829             }
830
831             /**************************
832              * CALCULATE INTERACTIONS *
833              **************************/
834
835             if (gmx_mm_any_lt(rsq32,rcutoff2))
836             {
837
838             r32              = _mm_mul_ps(rsq32,rinv32);
839
840             /* EWALD ELECTROSTATICS */
841
842             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
843             ewrt             = _mm_mul_ps(r32,ewtabscale);
844             ewitab           = _mm_cvttps_epi32(ewrt);
845             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
846             ewitab           = _mm_slli_epi32(ewitab,2);
847             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
848             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
849             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
850             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
851             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
852             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
853             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
854             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
855             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
856
857             d                = _mm_sub_ps(r32,rswitch);
858             d                = _mm_max_ps(d,_mm_setzero_ps());
859             d2               = _mm_mul_ps(d,d);
860             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
861
862             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
863
864             /* Evaluate switch function */
865             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
866             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
867             velec            = _mm_mul_ps(velec,sw);
868             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
869
870             /* Update potential sum for this i atom from the interaction with this j atom. */
871             velec            = _mm_and_ps(velec,cutoff_mask);
872             velecsum         = _mm_add_ps(velecsum,velec);
873
874             fscal            = felec;
875
876             fscal            = _mm_and_ps(fscal,cutoff_mask);
877
878             /* Calculate temporary vectorial force */
879             tx               = _mm_mul_ps(fscal,dx32);
880             ty               = _mm_mul_ps(fscal,dy32);
881             tz               = _mm_mul_ps(fscal,dz32);
882
883             /* Update vectorial force */
884             fix3             = _mm_add_ps(fix3,tx);
885             fiy3             = _mm_add_ps(fiy3,ty);
886             fiz3             = _mm_add_ps(fiz3,tz);
887
888             fjx2             = _mm_add_ps(fjx2,tx);
889             fjy2             = _mm_add_ps(fjy2,ty);
890             fjz2             = _mm_add_ps(fjz2,tz);
891             
892             }
893
894             /**************************
895              * CALCULATE INTERACTIONS *
896              **************************/
897
898             if (gmx_mm_any_lt(rsq33,rcutoff2))
899             {
900
901             r33              = _mm_mul_ps(rsq33,rinv33);
902
903             /* EWALD ELECTROSTATICS */
904
905             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
906             ewrt             = _mm_mul_ps(r33,ewtabscale);
907             ewitab           = _mm_cvttps_epi32(ewrt);
908             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
909             ewitab           = _mm_slli_epi32(ewitab,2);
910             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
911             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
912             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
913             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
914             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
915             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
916             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
917             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
918             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
919
920             d                = _mm_sub_ps(r33,rswitch);
921             d                = _mm_max_ps(d,_mm_setzero_ps());
922             d2               = _mm_mul_ps(d,d);
923             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
924
925             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
926
927             /* Evaluate switch function */
928             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
929             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
930             velec            = _mm_mul_ps(velec,sw);
931             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
932
933             /* Update potential sum for this i atom from the interaction with this j atom. */
934             velec            = _mm_and_ps(velec,cutoff_mask);
935             velecsum         = _mm_add_ps(velecsum,velec);
936
937             fscal            = felec;
938
939             fscal            = _mm_and_ps(fscal,cutoff_mask);
940
941             /* Calculate temporary vectorial force */
942             tx               = _mm_mul_ps(fscal,dx33);
943             ty               = _mm_mul_ps(fscal,dy33);
944             tz               = _mm_mul_ps(fscal,dz33);
945
946             /* Update vectorial force */
947             fix3             = _mm_add_ps(fix3,tx);
948             fiy3             = _mm_add_ps(fiy3,ty);
949             fiz3             = _mm_add_ps(fiz3,tz);
950
951             fjx3             = _mm_add_ps(fjx3,tx);
952             fjy3             = _mm_add_ps(fjy3,ty);
953             fjz3             = _mm_add_ps(fjz3,tz);
954             
955             }
956
957             fjptrA             = f+j_coord_offsetA;
958             fjptrB             = f+j_coord_offsetB;
959             fjptrC             = f+j_coord_offsetC;
960             fjptrD             = f+j_coord_offsetD;
961
962             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
963                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
964                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
965
966             /* Inner loop uses 647 flops */
967         }
968
969         if(jidx<j_index_end)
970         {
971
972             /* Get j neighbor index, and coordinate index */
973             jnrlistA         = jjnr[jidx];
974             jnrlistB         = jjnr[jidx+1];
975             jnrlistC         = jjnr[jidx+2];
976             jnrlistD         = jjnr[jidx+3];
977             /* Sign of each element will be negative for non-real atoms.
978              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
979              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
980              */
981             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
982             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
983             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
984             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
985             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
986             j_coord_offsetA  = DIM*jnrA;
987             j_coord_offsetB  = DIM*jnrB;
988             j_coord_offsetC  = DIM*jnrC;
989             j_coord_offsetD  = DIM*jnrD;
990
991             /* load j atom coordinates */
992             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
993                                               x+j_coord_offsetC,x+j_coord_offsetD,
994                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
995                                               &jy2,&jz2,&jx3,&jy3,&jz3);
996
997             /* Calculate displacement vector */
998             dx00             = _mm_sub_ps(ix0,jx0);
999             dy00             = _mm_sub_ps(iy0,jy0);
1000             dz00             = _mm_sub_ps(iz0,jz0);
1001             dx11             = _mm_sub_ps(ix1,jx1);
1002             dy11             = _mm_sub_ps(iy1,jy1);
1003             dz11             = _mm_sub_ps(iz1,jz1);
1004             dx12             = _mm_sub_ps(ix1,jx2);
1005             dy12             = _mm_sub_ps(iy1,jy2);
1006             dz12             = _mm_sub_ps(iz1,jz2);
1007             dx13             = _mm_sub_ps(ix1,jx3);
1008             dy13             = _mm_sub_ps(iy1,jy3);
1009             dz13             = _mm_sub_ps(iz1,jz3);
1010             dx21             = _mm_sub_ps(ix2,jx1);
1011             dy21             = _mm_sub_ps(iy2,jy1);
1012             dz21             = _mm_sub_ps(iz2,jz1);
1013             dx22             = _mm_sub_ps(ix2,jx2);
1014             dy22             = _mm_sub_ps(iy2,jy2);
1015             dz22             = _mm_sub_ps(iz2,jz2);
1016             dx23             = _mm_sub_ps(ix2,jx3);
1017             dy23             = _mm_sub_ps(iy2,jy3);
1018             dz23             = _mm_sub_ps(iz2,jz3);
1019             dx31             = _mm_sub_ps(ix3,jx1);
1020             dy31             = _mm_sub_ps(iy3,jy1);
1021             dz31             = _mm_sub_ps(iz3,jz1);
1022             dx32             = _mm_sub_ps(ix3,jx2);
1023             dy32             = _mm_sub_ps(iy3,jy2);
1024             dz32             = _mm_sub_ps(iz3,jz2);
1025             dx33             = _mm_sub_ps(ix3,jx3);
1026             dy33             = _mm_sub_ps(iy3,jy3);
1027             dz33             = _mm_sub_ps(iz3,jz3);
1028
1029             /* Calculate squared distance and things based on it */
1030             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1031             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1032             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1033             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1034             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1035             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1036             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1037             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1038             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1039             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1040
1041             rinv00           = sse2_invsqrt_f(rsq00);
1042             rinv11           = sse2_invsqrt_f(rsq11);
1043             rinv12           = sse2_invsqrt_f(rsq12);
1044             rinv13           = sse2_invsqrt_f(rsq13);
1045             rinv21           = sse2_invsqrt_f(rsq21);
1046             rinv22           = sse2_invsqrt_f(rsq22);
1047             rinv23           = sse2_invsqrt_f(rsq23);
1048             rinv31           = sse2_invsqrt_f(rsq31);
1049             rinv32           = sse2_invsqrt_f(rsq32);
1050             rinv33           = sse2_invsqrt_f(rsq33);
1051
1052             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1053             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1054             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1055             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1056             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1057             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1058             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1059             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1060             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1061             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1062
1063             fjx0             = _mm_setzero_ps();
1064             fjy0             = _mm_setzero_ps();
1065             fjz0             = _mm_setzero_ps();
1066             fjx1             = _mm_setzero_ps();
1067             fjy1             = _mm_setzero_ps();
1068             fjz1             = _mm_setzero_ps();
1069             fjx2             = _mm_setzero_ps();
1070             fjy2             = _mm_setzero_ps();
1071             fjz2             = _mm_setzero_ps();
1072             fjx3             = _mm_setzero_ps();
1073             fjy3             = _mm_setzero_ps();
1074             fjz3             = _mm_setzero_ps();
1075
1076             /**************************
1077              * CALCULATE INTERACTIONS *
1078              **************************/
1079
1080             if (gmx_mm_any_lt(rsq00,rcutoff2))
1081             {
1082
1083             r00              = _mm_mul_ps(rsq00,rinv00);
1084             r00              = _mm_andnot_ps(dummy_mask,r00);
1085
1086             /* LENNARD-JONES DISPERSION/REPULSION */
1087
1088             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1089             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1090             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1091             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1092             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1093
1094             d                = _mm_sub_ps(r00,rswitch);
1095             d                = _mm_max_ps(d,_mm_setzero_ps());
1096             d2               = _mm_mul_ps(d,d);
1097             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1098
1099             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1100
1101             /* Evaluate switch function */
1102             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1103             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1104             vvdw             = _mm_mul_ps(vvdw,sw);
1105             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1106
1107             /* Update potential sum for this i atom from the interaction with this j atom. */
1108             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
1109             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
1110             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
1111
1112             fscal            = fvdw;
1113
1114             fscal            = _mm_and_ps(fscal,cutoff_mask);
1115
1116             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1117
1118             /* Calculate temporary vectorial force */
1119             tx               = _mm_mul_ps(fscal,dx00);
1120             ty               = _mm_mul_ps(fscal,dy00);
1121             tz               = _mm_mul_ps(fscal,dz00);
1122
1123             /* Update vectorial force */
1124             fix0             = _mm_add_ps(fix0,tx);
1125             fiy0             = _mm_add_ps(fiy0,ty);
1126             fiz0             = _mm_add_ps(fiz0,tz);
1127
1128             fjx0             = _mm_add_ps(fjx0,tx);
1129             fjy0             = _mm_add_ps(fjy0,ty);
1130             fjz0             = _mm_add_ps(fjz0,tz);
1131             
1132             }
1133
1134             /**************************
1135              * CALCULATE INTERACTIONS *
1136              **************************/
1137
1138             if (gmx_mm_any_lt(rsq11,rcutoff2))
1139             {
1140
1141             r11              = _mm_mul_ps(rsq11,rinv11);
1142             r11              = _mm_andnot_ps(dummy_mask,r11);
1143
1144             /* EWALD ELECTROSTATICS */
1145
1146             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1147             ewrt             = _mm_mul_ps(r11,ewtabscale);
1148             ewitab           = _mm_cvttps_epi32(ewrt);
1149             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1150             ewitab           = _mm_slli_epi32(ewitab,2);
1151             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1152             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1153             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1154             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1155             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1156             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1157             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1158             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1159             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1160
1161             d                = _mm_sub_ps(r11,rswitch);
1162             d                = _mm_max_ps(d,_mm_setzero_ps());
1163             d2               = _mm_mul_ps(d,d);
1164             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1165
1166             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1167
1168             /* Evaluate switch function */
1169             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1170             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1171             velec            = _mm_mul_ps(velec,sw);
1172             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1173
1174             /* Update potential sum for this i atom from the interaction with this j atom. */
1175             velec            = _mm_and_ps(velec,cutoff_mask);
1176             velec            = _mm_andnot_ps(dummy_mask,velec);
1177             velecsum         = _mm_add_ps(velecsum,velec);
1178
1179             fscal            = felec;
1180
1181             fscal            = _mm_and_ps(fscal,cutoff_mask);
1182
1183             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1184
1185             /* Calculate temporary vectorial force */
1186             tx               = _mm_mul_ps(fscal,dx11);
1187             ty               = _mm_mul_ps(fscal,dy11);
1188             tz               = _mm_mul_ps(fscal,dz11);
1189
1190             /* Update vectorial force */
1191             fix1             = _mm_add_ps(fix1,tx);
1192             fiy1             = _mm_add_ps(fiy1,ty);
1193             fiz1             = _mm_add_ps(fiz1,tz);
1194
1195             fjx1             = _mm_add_ps(fjx1,tx);
1196             fjy1             = _mm_add_ps(fjy1,ty);
1197             fjz1             = _mm_add_ps(fjz1,tz);
1198             
1199             }
1200
1201             /**************************
1202              * CALCULATE INTERACTIONS *
1203              **************************/
1204
1205             if (gmx_mm_any_lt(rsq12,rcutoff2))
1206             {
1207
1208             r12              = _mm_mul_ps(rsq12,rinv12);
1209             r12              = _mm_andnot_ps(dummy_mask,r12);
1210
1211             /* EWALD ELECTROSTATICS */
1212
1213             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1214             ewrt             = _mm_mul_ps(r12,ewtabscale);
1215             ewitab           = _mm_cvttps_epi32(ewrt);
1216             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1217             ewitab           = _mm_slli_epi32(ewitab,2);
1218             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1219             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1220             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1221             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1222             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1223             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1224             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1225             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1226             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1227
1228             d                = _mm_sub_ps(r12,rswitch);
1229             d                = _mm_max_ps(d,_mm_setzero_ps());
1230             d2               = _mm_mul_ps(d,d);
1231             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1232
1233             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1234
1235             /* Evaluate switch function */
1236             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1237             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1238             velec            = _mm_mul_ps(velec,sw);
1239             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1240
1241             /* Update potential sum for this i atom from the interaction with this j atom. */
1242             velec            = _mm_and_ps(velec,cutoff_mask);
1243             velec            = _mm_andnot_ps(dummy_mask,velec);
1244             velecsum         = _mm_add_ps(velecsum,velec);
1245
1246             fscal            = felec;
1247
1248             fscal            = _mm_and_ps(fscal,cutoff_mask);
1249
1250             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1251
1252             /* Calculate temporary vectorial force */
1253             tx               = _mm_mul_ps(fscal,dx12);
1254             ty               = _mm_mul_ps(fscal,dy12);
1255             tz               = _mm_mul_ps(fscal,dz12);
1256
1257             /* Update vectorial force */
1258             fix1             = _mm_add_ps(fix1,tx);
1259             fiy1             = _mm_add_ps(fiy1,ty);
1260             fiz1             = _mm_add_ps(fiz1,tz);
1261
1262             fjx2             = _mm_add_ps(fjx2,tx);
1263             fjy2             = _mm_add_ps(fjy2,ty);
1264             fjz2             = _mm_add_ps(fjz2,tz);
1265             
1266             }
1267
1268             /**************************
1269              * CALCULATE INTERACTIONS *
1270              **************************/
1271
1272             if (gmx_mm_any_lt(rsq13,rcutoff2))
1273             {
1274
1275             r13              = _mm_mul_ps(rsq13,rinv13);
1276             r13              = _mm_andnot_ps(dummy_mask,r13);
1277
1278             /* EWALD ELECTROSTATICS */
1279
1280             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1281             ewrt             = _mm_mul_ps(r13,ewtabscale);
1282             ewitab           = _mm_cvttps_epi32(ewrt);
1283             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1284             ewitab           = _mm_slli_epi32(ewitab,2);
1285             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1286             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1287             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1288             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1289             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1290             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1291             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1292             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1293             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1294
1295             d                = _mm_sub_ps(r13,rswitch);
1296             d                = _mm_max_ps(d,_mm_setzero_ps());
1297             d2               = _mm_mul_ps(d,d);
1298             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1299
1300             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1301
1302             /* Evaluate switch function */
1303             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1304             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1305             velec            = _mm_mul_ps(velec,sw);
1306             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1307
1308             /* Update potential sum for this i atom from the interaction with this j atom. */
1309             velec            = _mm_and_ps(velec,cutoff_mask);
1310             velec            = _mm_andnot_ps(dummy_mask,velec);
1311             velecsum         = _mm_add_ps(velecsum,velec);
1312
1313             fscal            = felec;
1314
1315             fscal            = _mm_and_ps(fscal,cutoff_mask);
1316
1317             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1318
1319             /* Calculate temporary vectorial force */
1320             tx               = _mm_mul_ps(fscal,dx13);
1321             ty               = _mm_mul_ps(fscal,dy13);
1322             tz               = _mm_mul_ps(fscal,dz13);
1323
1324             /* Update vectorial force */
1325             fix1             = _mm_add_ps(fix1,tx);
1326             fiy1             = _mm_add_ps(fiy1,ty);
1327             fiz1             = _mm_add_ps(fiz1,tz);
1328
1329             fjx3             = _mm_add_ps(fjx3,tx);
1330             fjy3             = _mm_add_ps(fjy3,ty);
1331             fjz3             = _mm_add_ps(fjz3,tz);
1332             
1333             }
1334
1335             /**************************
1336              * CALCULATE INTERACTIONS *
1337              **************************/
1338
1339             if (gmx_mm_any_lt(rsq21,rcutoff2))
1340             {
1341
1342             r21              = _mm_mul_ps(rsq21,rinv21);
1343             r21              = _mm_andnot_ps(dummy_mask,r21);
1344
1345             /* EWALD ELECTROSTATICS */
1346
1347             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1348             ewrt             = _mm_mul_ps(r21,ewtabscale);
1349             ewitab           = _mm_cvttps_epi32(ewrt);
1350             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1351             ewitab           = _mm_slli_epi32(ewitab,2);
1352             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1353             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1354             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1355             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1356             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1357             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1358             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1359             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1360             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1361
1362             d                = _mm_sub_ps(r21,rswitch);
1363             d                = _mm_max_ps(d,_mm_setzero_ps());
1364             d2               = _mm_mul_ps(d,d);
1365             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1366
1367             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1368
1369             /* Evaluate switch function */
1370             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1371             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1372             velec            = _mm_mul_ps(velec,sw);
1373             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1374
1375             /* Update potential sum for this i atom from the interaction with this j atom. */
1376             velec            = _mm_and_ps(velec,cutoff_mask);
1377             velec            = _mm_andnot_ps(dummy_mask,velec);
1378             velecsum         = _mm_add_ps(velecsum,velec);
1379
1380             fscal            = felec;
1381
1382             fscal            = _mm_and_ps(fscal,cutoff_mask);
1383
1384             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1385
1386             /* Calculate temporary vectorial force */
1387             tx               = _mm_mul_ps(fscal,dx21);
1388             ty               = _mm_mul_ps(fscal,dy21);
1389             tz               = _mm_mul_ps(fscal,dz21);
1390
1391             /* Update vectorial force */
1392             fix2             = _mm_add_ps(fix2,tx);
1393             fiy2             = _mm_add_ps(fiy2,ty);
1394             fiz2             = _mm_add_ps(fiz2,tz);
1395
1396             fjx1             = _mm_add_ps(fjx1,tx);
1397             fjy1             = _mm_add_ps(fjy1,ty);
1398             fjz1             = _mm_add_ps(fjz1,tz);
1399             
1400             }
1401
1402             /**************************
1403              * CALCULATE INTERACTIONS *
1404              **************************/
1405
1406             if (gmx_mm_any_lt(rsq22,rcutoff2))
1407             {
1408
1409             r22              = _mm_mul_ps(rsq22,rinv22);
1410             r22              = _mm_andnot_ps(dummy_mask,r22);
1411
1412             /* EWALD ELECTROSTATICS */
1413
1414             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1415             ewrt             = _mm_mul_ps(r22,ewtabscale);
1416             ewitab           = _mm_cvttps_epi32(ewrt);
1417             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1418             ewitab           = _mm_slli_epi32(ewitab,2);
1419             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1420             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1421             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1422             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1423             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1424             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1425             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1426             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1427             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1428
1429             d                = _mm_sub_ps(r22,rswitch);
1430             d                = _mm_max_ps(d,_mm_setzero_ps());
1431             d2               = _mm_mul_ps(d,d);
1432             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1433
1434             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1435
1436             /* Evaluate switch function */
1437             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1438             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1439             velec            = _mm_mul_ps(velec,sw);
1440             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1441
1442             /* Update potential sum for this i atom from the interaction with this j atom. */
1443             velec            = _mm_and_ps(velec,cutoff_mask);
1444             velec            = _mm_andnot_ps(dummy_mask,velec);
1445             velecsum         = _mm_add_ps(velecsum,velec);
1446
1447             fscal            = felec;
1448
1449             fscal            = _mm_and_ps(fscal,cutoff_mask);
1450
1451             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1452
1453             /* Calculate temporary vectorial force */
1454             tx               = _mm_mul_ps(fscal,dx22);
1455             ty               = _mm_mul_ps(fscal,dy22);
1456             tz               = _mm_mul_ps(fscal,dz22);
1457
1458             /* Update vectorial force */
1459             fix2             = _mm_add_ps(fix2,tx);
1460             fiy2             = _mm_add_ps(fiy2,ty);
1461             fiz2             = _mm_add_ps(fiz2,tz);
1462
1463             fjx2             = _mm_add_ps(fjx2,tx);
1464             fjy2             = _mm_add_ps(fjy2,ty);
1465             fjz2             = _mm_add_ps(fjz2,tz);
1466             
1467             }
1468
1469             /**************************
1470              * CALCULATE INTERACTIONS *
1471              **************************/
1472
1473             if (gmx_mm_any_lt(rsq23,rcutoff2))
1474             {
1475
1476             r23              = _mm_mul_ps(rsq23,rinv23);
1477             r23              = _mm_andnot_ps(dummy_mask,r23);
1478
1479             /* EWALD ELECTROSTATICS */
1480
1481             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1482             ewrt             = _mm_mul_ps(r23,ewtabscale);
1483             ewitab           = _mm_cvttps_epi32(ewrt);
1484             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1485             ewitab           = _mm_slli_epi32(ewitab,2);
1486             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1487             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1488             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1489             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1490             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1491             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1492             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1493             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1494             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1495
1496             d                = _mm_sub_ps(r23,rswitch);
1497             d                = _mm_max_ps(d,_mm_setzero_ps());
1498             d2               = _mm_mul_ps(d,d);
1499             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1500
1501             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1502
1503             /* Evaluate switch function */
1504             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1505             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1506             velec            = _mm_mul_ps(velec,sw);
1507             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1508
1509             /* Update potential sum for this i atom from the interaction with this j atom. */
1510             velec            = _mm_and_ps(velec,cutoff_mask);
1511             velec            = _mm_andnot_ps(dummy_mask,velec);
1512             velecsum         = _mm_add_ps(velecsum,velec);
1513
1514             fscal            = felec;
1515
1516             fscal            = _mm_and_ps(fscal,cutoff_mask);
1517
1518             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1519
1520             /* Calculate temporary vectorial force */
1521             tx               = _mm_mul_ps(fscal,dx23);
1522             ty               = _mm_mul_ps(fscal,dy23);
1523             tz               = _mm_mul_ps(fscal,dz23);
1524
1525             /* Update vectorial force */
1526             fix2             = _mm_add_ps(fix2,tx);
1527             fiy2             = _mm_add_ps(fiy2,ty);
1528             fiz2             = _mm_add_ps(fiz2,tz);
1529
1530             fjx3             = _mm_add_ps(fjx3,tx);
1531             fjy3             = _mm_add_ps(fjy3,ty);
1532             fjz3             = _mm_add_ps(fjz3,tz);
1533             
1534             }
1535
1536             /**************************
1537              * CALCULATE INTERACTIONS *
1538              **************************/
1539
1540             if (gmx_mm_any_lt(rsq31,rcutoff2))
1541             {
1542
1543             r31              = _mm_mul_ps(rsq31,rinv31);
1544             r31              = _mm_andnot_ps(dummy_mask,r31);
1545
1546             /* EWALD ELECTROSTATICS */
1547
1548             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1549             ewrt             = _mm_mul_ps(r31,ewtabscale);
1550             ewitab           = _mm_cvttps_epi32(ewrt);
1551             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1552             ewitab           = _mm_slli_epi32(ewitab,2);
1553             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1554             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1555             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1556             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1557             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1558             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1559             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1560             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1561             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1562
1563             d                = _mm_sub_ps(r31,rswitch);
1564             d                = _mm_max_ps(d,_mm_setzero_ps());
1565             d2               = _mm_mul_ps(d,d);
1566             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1567
1568             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1569
1570             /* Evaluate switch function */
1571             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1572             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1573             velec            = _mm_mul_ps(velec,sw);
1574             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1575
1576             /* Update potential sum for this i atom from the interaction with this j atom. */
1577             velec            = _mm_and_ps(velec,cutoff_mask);
1578             velec            = _mm_andnot_ps(dummy_mask,velec);
1579             velecsum         = _mm_add_ps(velecsum,velec);
1580
1581             fscal            = felec;
1582
1583             fscal            = _mm_and_ps(fscal,cutoff_mask);
1584
1585             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1586
1587             /* Calculate temporary vectorial force */
1588             tx               = _mm_mul_ps(fscal,dx31);
1589             ty               = _mm_mul_ps(fscal,dy31);
1590             tz               = _mm_mul_ps(fscal,dz31);
1591
1592             /* Update vectorial force */
1593             fix3             = _mm_add_ps(fix3,tx);
1594             fiy3             = _mm_add_ps(fiy3,ty);
1595             fiz3             = _mm_add_ps(fiz3,tz);
1596
1597             fjx1             = _mm_add_ps(fjx1,tx);
1598             fjy1             = _mm_add_ps(fjy1,ty);
1599             fjz1             = _mm_add_ps(fjz1,tz);
1600             
1601             }
1602
1603             /**************************
1604              * CALCULATE INTERACTIONS *
1605              **************************/
1606
1607             if (gmx_mm_any_lt(rsq32,rcutoff2))
1608             {
1609
1610             r32              = _mm_mul_ps(rsq32,rinv32);
1611             r32              = _mm_andnot_ps(dummy_mask,r32);
1612
1613             /* EWALD ELECTROSTATICS */
1614
1615             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1616             ewrt             = _mm_mul_ps(r32,ewtabscale);
1617             ewitab           = _mm_cvttps_epi32(ewrt);
1618             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1619             ewitab           = _mm_slli_epi32(ewitab,2);
1620             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1621             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1622             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1623             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1624             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1625             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1626             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1627             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1628             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1629
1630             d                = _mm_sub_ps(r32,rswitch);
1631             d                = _mm_max_ps(d,_mm_setzero_ps());
1632             d2               = _mm_mul_ps(d,d);
1633             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1634
1635             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1636
1637             /* Evaluate switch function */
1638             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1639             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1640             velec            = _mm_mul_ps(velec,sw);
1641             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1642
1643             /* Update potential sum for this i atom from the interaction with this j atom. */
1644             velec            = _mm_and_ps(velec,cutoff_mask);
1645             velec            = _mm_andnot_ps(dummy_mask,velec);
1646             velecsum         = _mm_add_ps(velecsum,velec);
1647
1648             fscal            = felec;
1649
1650             fscal            = _mm_and_ps(fscal,cutoff_mask);
1651
1652             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1653
1654             /* Calculate temporary vectorial force */
1655             tx               = _mm_mul_ps(fscal,dx32);
1656             ty               = _mm_mul_ps(fscal,dy32);
1657             tz               = _mm_mul_ps(fscal,dz32);
1658
1659             /* Update vectorial force */
1660             fix3             = _mm_add_ps(fix3,tx);
1661             fiy3             = _mm_add_ps(fiy3,ty);
1662             fiz3             = _mm_add_ps(fiz3,tz);
1663
1664             fjx2             = _mm_add_ps(fjx2,tx);
1665             fjy2             = _mm_add_ps(fjy2,ty);
1666             fjz2             = _mm_add_ps(fjz2,tz);
1667             
1668             }
1669
1670             /**************************
1671              * CALCULATE INTERACTIONS *
1672              **************************/
1673
1674             if (gmx_mm_any_lt(rsq33,rcutoff2))
1675             {
1676
1677             r33              = _mm_mul_ps(rsq33,rinv33);
1678             r33              = _mm_andnot_ps(dummy_mask,r33);
1679
1680             /* EWALD ELECTROSTATICS */
1681
1682             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1683             ewrt             = _mm_mul_ps(r33,ewtabscale);
1684             ewitab           = _mm_cvttps_epi32(ewrt);
1685             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1686             ewitab           = _mm_slli_epi32(ewitab,2);
1687             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1688             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1689             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1690             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1691             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1692             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1693             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1694             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1695             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1696
1697             d                = _mm_sub_ps(r33,rswitch);
1698             d                = _mm_max_ps(d,_mm_setzero_ps());
1699             d2               = _mm_mul_ps(d,d);
1700             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1701
1702             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1703
1704             /* Evaluate switch function */
1705             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1706             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1707             velec            = _mm_mul_ps(velec,sw);
1708             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1709
1710             /* Update potential sum for this i atom from the interaction with this j atom. */
1711             velec            = _mm_and_ps(velec,cutoff_mask);
1712             velec            = _mm_andnot_ps(dummy_mask,velec);
1713             velecsum         = _mm_add_ps(velecsum,velec);
1714
1715             fscal            = felec;
1716
1717             fscal            = _mm_and_ps(fscal,cutoff_mask);
1718
1719             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1720
1721             /* Calculate temporary vectorial force */
1722             tx               = _mm_mul_ps(fscal,dx33);
1723             ty               = _mm_mul_ps(fscal,dy33);
1724             tz               = _mm_mul_ps(fscal,dz33);
1725
1726             /* Update vectorial force */
1727             fix3             = _mm_add_ps(fix3,tx);
1728             fiy3             = _mm_add_ps(fiy3,ty);
1729             fiz3             = _mm_add_ps(fiz3,tz);
1730
1731             fjx3             = _mm_add_ps(fjx3,tx);
1732             fjy3             = _mm_add_ps(fjy3,ty);
1733             fjz3             = _mm_add_ps(fjz3,tz);
1734             
1735             }
1736
1737             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1738             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1739             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1740             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1741
1742             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1743                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1744                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1745
1746             /* Inner loop uses 657 flops */
1747         }
1748
1749         /* End of innermost loop */
1750
1751         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1752                                               f+i_coord_offset,fshift+i_shift_offset);
1753
1754         ggid                        = gid[iidx];
1755         /* Update potential energies */
1756         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1757         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1758
1759         /* Increment number of inner iterations */
1760         inneriter                  += j_index_end - j_index_start;
1761
1762         /* Outer loop uses 26 flops */
1763     }
1764
1765     /* Increment number of outer iterations */
1766     outeriter        += nri;
1767
1768     /* Update outer/inner flops */
1769
1770     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1771 }
1772 /*
1773  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1774  * Electrostatics interaction: Ewald
1775  * VdW interaction:            LennardJones
1776  * Geometry:                   Water4-Water4
1777  * Calculate force/pot:        Force
1778  */
1779 void
1780 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1781                     (t_nblist                    * gmx_restrict       nlist,
1782                      rvec                        * gmx_restrict          xx,
1783                      rvec                        * gmx_restrict          ff,
1784                      struct t_forcerec           * gmx_restrict          fr,
1785                      t_mdatoms                   * gmx_restrict     mdatoms,
1786                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1787                      t_nrnb                      * gmx_restrict        nrnb)
1788 {
1789     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1790      * just 0 for non-waters.
1791      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1792      * jnr indices corresponding to data put in the four positions in the SIMD register.
1793      */
1794     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1795     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1796     int              jnrA,jnrB,jnrC,jnrD;
1797     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1798     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1799     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1800     real             rcutoff_scalar;
1801     real             *shiftvec,*fshift,*x,*f;
1802     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1803     real             scratch[4*DIM];
1804     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1805     int              vdwioffset0;
1806     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1807     int              vdwioffset1;
1808     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1809     int              vdwioffset2;
1810     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1811     int              vdwioffset3;
1812     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1813     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1814     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1815     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1816     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1817     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1818     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1819     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1820     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1821     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1822     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1823     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1824     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1825     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1826     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1827     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1828     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1829     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1830     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1831     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1832     real             *charge;
1833     int              nvdwtype;
1834     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1835     int              *vdwtype;
1836     real             *vdwparam;
1837     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1838     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1839     __m128i          ewitab;
1840     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1841     real             *ewtab;
1842     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1843     real             rswitch_scalar,d_scalar;
1844     __m128           dummy_mask,cutoff_mask;
1845     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1846     __m128           one     = _mm_set1_ps(1.0);
1847     __m128           two     = _mm_set1_ps(2.0);
1848     x                = xx[0];
1849     f                = ff[0];
1850
1851     nri              = nlist->nri;
1852     iinr             = nlist->iinr;
1853     jindex           = nlist->jindex;
1854     jjnr             = nlist->jjnr;
1855     shiftidx         = nlist->shift;
1856     gid              = nlist->gid;
1857     shiftvec         = fr->shift_vec[0];
1858     fshift           = fr->fshift[0];
1859     facel            = _mm_set1_ps(fr->ic->epsfac);
1860     charge           = mdatoms->chargeA;
1861     nvdwtype         = fr->ntype;
1862     vdwparam         = fr->nbfp;
1863     vdwtype          = mdatoms->typeA;
1864
1865     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1866     ewtab            = fr->ic->tabq_coul_FDV0;
1867     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1868     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1869
1870     /* Setup water-specific parameters */
1871     inr              = nlist->iinr[0];
1872     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1873     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1874     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1875     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1876
1877     jq1              = _mm_set1_ps(charge[inr+1]);
1878     jq2              = _mm_set1_ps(charge[inr+2]);
1879     jq3              = _mm_set1_ps(charge[inr+3]);
1880     vdwjidx0A        = 2*vdwtype[inr+0];
1881     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1882     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1883     qq11             = _mm_mul_ps(iq1,jq1);
1884     qq12             = _mm_mul_ps(iq1,jq2);
1885     qq13             = _mm_mul_ps(iq1,jq3);
1886     qq21             = _mm_mul_ps(iq2,jq1);
1887     qq22             = _mm_mul_ps(iq2,jq2);
1888     qq23             = _mm_mul_ps(iq2,jq3);
1889     qq31             = _mm_mul_ps(iq3,jq1);
1890     qq32             = _mm_mul_ps(iq3,jq2);
1891     qq33             = _mm_mul_ps(iq3,jq3);
1892
1893     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1894     rcutoff_scalar   = fr->ic->rcoulomb;
1895     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1896     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1897
1898     rswitch_scalar   = fr->ic->rcoulomb_switch;
1899     rswitch          = _mm_set1_ps(rswitch_scalar);
1900     /* Setup switch parameters */
1901     d_scalar         = rcutoff_scalar-rswitch_scalar;
1902     d                = _mm_set1_ps(d_scalar);
1903     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1904     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1905     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1906     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1907     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1908     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1909
1910     /* Avoid stupid compiler warnings */
1911     jnrA = jnrB = jnrC = jnrD = 0;
1912     j_coord_offsetA = 0;
1913     j_coord_offsetB = 0;
1914     j_coord_offsetC = 0;
1915     j_coord_offsetD = 0;
1916
1917     outeriter        = 0;
1918     inneriter        = 0;
1919
1920     for(iidx=0;iidx<4*DIM;iidx++)
1921     {
1922         scratch[iidx] = 0.0;
1923     }  
1924
1925     /* Start outer loop over neighborlists */
1926     for(iidx=0; iidx<nri; iidx++)
1927     {
1928         /* Load shift vector for this list */
1929         i_shift_offset   = DIM*shiftidx[iidx];
1930
1931         /* Load limits for loop over neighbors */
1932         j_index_start    = jindex[iidx];
1933         j_index_end      = jindex[iidx+1];
1934
1935         /* Get outer coordinate index */
1936         inr              = iinr[iidx];
1937         i_coord_offset   = DIM*inr;
1938
1939         /* Load i particle coords and add shift vector */
1940         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1941                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1942         
1943         fix0             = _mm_setzero_ps();
1944         fiy0             = _mm_setzero_ps();
1945         fiz0             = _mm_setzero_ps();
1946         fix1             = _mm_setzero_ps();
1947         fiy1             = _mm_setzero_ps();
1948         fiz1             = _mm_setzero_ps();
1949         fix2             = _mm_setzero_ps();
1950         fiy2             = _mm_setzero_ps();
1951         fiz2             = _mm_setzero_ps();
1952         fix3             = _mm_setzero_ps();
1953         fiy3             = _mm_setzero_ps();
1954         fiz3             = _mm_setzero_ps();
1955
1956         /* Start inner kernel loop */
1957         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1958         {
1959
1960             /* Get j neighbor index, and coordinate index */
1961             jnrA             = jjnr[jidx];
1962             jnrB             = jjnr[jidx+1];
1963             jnrC             = jjnr[jidx+2];
1964             jnrD             = jjnr[jidx+3];
1965             j_coord_offsetA  = DIM*jnrA;
1966             j_coord_offsetB  = DIM*jnrB;
1967             j_coord_offsetC  = DIM*jnrC;
1968             j_coord_offsetD  = DIM*jnrD;
1969
1970             /* load j atom coordinates */
1971             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1972                                               x+j_coord_offsetC,x+j_coord_offsetD,
1973                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1974                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1975
1976             /* Calculate displacement vector */
1977             dx00             = _mm_sub_ps(ix0,jx0);
1978             dy00             = _mm_sub_ps(iy0,jy0);
1979             dz00             = _mm_sub_ps(iz0,jz0);
1980             dx11             = _mm_sub_ps(ix1,jx1);
1981             dy11             = _mm_sub_ps(iy1,jy1);
1982             dz11             = _mm_sub_ps(iz1,jz1);
1983             dx12             = _mm_sub_ps(ix1,jx2);
1984             dy12             = _mm_sub_ps(iy1,jy2);
1985             dz12             = _mm_sub_ps(iz1,jz2);
1986             dx13             = _mm_sub_ps(ix1,jx3);
1987             dy13             = _mm_sub_ps(iy1,jy3);
1988             dz13             = _mm_sub_ps(iz1,jz3);
1989             dx21             = _mm_sub_ps(ix2,jx1);
1990             dy21             = _mm_sub_ps(iy2,jy1);
1991             dz21             = _mm_sub_ps(iz2,jz1);
1992             dx22             = _mm_sub_ps(ix2,jx2);
1993             dy22             = _mm_sub_ps(iy2,jy2);
1994             dz22             = _mm_sub_ps(iz2,jz2);
1995             dx23             = _mm_sub_ps(ix2,jx3);
1996             dy23             = _mm_sub_ps(iy2,jy3);
1997             dz23             = _mm_sub_ps(iz2,jz3);
1998             dx31             = _mm_sub_ps(ix3,jx1);
1999             dy31             = _mm_sub_ps(iy3,jy1);
2000             dz31             = _mm_sub_ps(iz3,jz1);
2001             dx32             = _mm_sub_ps(ix3,jx2);
2002             dy32             = _mm_sub_ps(iy3,jy2);
2003             dz32             = _mm_sub_ps(iz3,jz2);
2004             dx33             = _mm_sub_ps(ix3,jx3);
2005             dy33             = _mm_sub_ps(iy3,jy3);
2006             dz33             = _mm_sub_ps(iz3,jz3);
2007
2008             /* Calculate squared distance and things based on it */
2009             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2010             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2011             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2012             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2013             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2014             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2015             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2016             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2017             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2018             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2019
2020             rinv00           = sse2_invsqrt_f(rsq00);
2021             rinv11           = sse2_invsqrt_f(rsq11);
2022             rinv12           = sse2_invsqrt_f(rsq12);
2023             rinv13           = sse2_invsqrt_f(rsq13);
2024             rinv21           = sse2_invsqrt_f(rsq21);
2025             rinv22           = sse2_invsqrt_f(rsq22);
2026             rinv23           = sse2_invsqrt_f(rsq23);
2027             rinv31           = sse2_invsqrt_f(rsq31);
2028             rinv32           = sse2_invsqrt_f(rsq32);
2029             rinv33           = sse2_invsqrt_f(rsq33);
2030
2031             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2032             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2033             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2034             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2035             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2036             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2037             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2038             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2039             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2040             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2041
2042             fjx0             = _mm_setzero_ps();
2043             fjy0             = _mm_setzero_ps();
2044             fjz0             = _mm_setzero_ps();
2045             fjx1             = _mm_setzero_ps();
2046             fjy1             = _mm_setzero_ps();
2047             fjz1             = _mm_setzero_ps();
2048             fjx2             = _mm_setzero_ps();
2049             fjy2             = _mm_setzero_ps();
2050             fjz2             = _mm_setzero_ps();
2051             fjx3             = _mm_setzero_ps();
2052             fjy3             = _mm_setzero_ps();
2053             fjz3             = _mm_setzero_ps();
2054
2055             /**************************
2056              * CALCULATE INTERACTIONS *
2057              **************************/
2058
2059             if (gmx_mm_any_lt(rsq00,rcutoff2))
2060             {
2061
2062             r00              = _mm_mul_ps(rsq00,rinv00);
2063
2064             /* LENNARD-JONES DISPERSION/REPULSION */
2065
2066             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2067             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2068             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2069             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2070             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2071
2072             d                = _mm_sub_ps(r00,rswitch);
2073             d                = _mm_max_ps(d,_mm_setzero_ps());
2074             d2               = _mm_mul_ps(d,d);
2075             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2076
2077             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2078
2079             /* Evaluate switch function */
2080             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2081             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2082             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2083
2084             fscal            = fvdw;
2085
2086             fscal            = _mm_and_ps(fscal,cutoff_mask);
2087
2088             /* Calculate temporary vectorial force */
2089             tx               = _mm_mul_ps(fscal,dx00);
2090             ty               = _mm_mul_ps(fscal,dy00);
2091             tz               = _mm_mul_ps(fscal,dz00);
2092
2093             /* Update vectorial force */
2094             fix0             = _mm_add_ps(fix0,tx);
2095             fiy0             = _mm_add_ps(fiy0,ty);
2096             fiz0             = _mm_add_ps(fiz0,tz);
2097
2098             fjx0             = _mm_add_ps(fjx0,tx);
2099             fjy0             = _mm_add_ps(fjy0,ty);
2100             fjz0             = _mm_add_ps(fjz0,tz);
2101             
2102             }
2103
2104             /**************************
2105              * CALCULATE INTERACTIONS *
2106              **************************/
2107
2108             if (gmx_mm_any_lt(rsq11,rcutoff2))
2109             {
2110
2111             r11              = _mm_mul_ps(rsq11,rinv11);
2112
2113             /* EWALD ELECTROSTATICS */
2114
2115             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2116             ewrt             = _mm_mul_ps(r11,ewtabscale);
2117             ewitab           = _mm_cvttps_epi32(ewrt);
2118             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2119             ewitab           = _mm_slli_epi32(ewitab,2);
2120             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2121             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2122             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2123             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2124             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2125             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2126             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2127             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2128             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2129
2130             d                = _mm_sub_ps(r11,rswitch);
2131             d                = _mm_max_ps(d,_mm_setzero_ps());
2132             d2               = _mm_mul_ps(d,d);
2133             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2134
2135             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2136
2137             /* Evaluate switch function */
2138             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2139             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2140             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2141
2142             fscal            = felec;
2143
2144             fscal            = _mm_and_ps(fscal,cutoff_mask);
2145
2146             /* Calculate temporary vectorial force */
2147             tx               = _mm_mul_ps(fscal,dx11);
2148             ty               = _mm_mul_ps(fscal,dy11);
2149             tz               = _mm_mul_ps(fscal,dz11);
2150
2151             /* Update vectorial force */
2152             fix1             = _mm_add_ps(fix1,tx);
2153             fiy1             = _mm_add_ps(fiy1,ty);
2154             fiz1             = _mm_add_ps(fiz1,tz);
2155
2156             fjx1             = _mm_add_ps(fjx1,tx);
2157             fjy1             = _mm_add_ps(fjy1,ty);
2158             fjz1             = _mm_add_ps(fjz1,tz);
2159             
2160             }
2161
2162             /**************************
2163              * CALCULATE INTERACTIONS *
2164              **************************/
2165
2166             if (gmx_mm_any_lt(rsq12,rcutoff2))
2167             {
2168
2169             r12              = _mm_mul_ps(rsq12,rinv12);
2170
2171             /* EWALD ELECTROSTATICS */
2172
2173             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2174             ewrt             = _mm_mul_ps(r12,ewtabscale);
2175             ewitab           = _mm_cvttps_epi32(ewrt);
2176             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2177             ewitab           = _mm_slli_epi32(ewitab,2);
2178             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2179             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2180             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2181             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2182             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2183             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2184             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2185             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2186             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2187
2188             d                = _mm_sub_ps(r12,rswitch);
2189             d                = _mm_max_ps(d,_mm_setzero_ps());
2190             d2               = _mm_mul_ps(d,d);
2191             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2192
2193             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2194
2195             /* Evaluate switch function */
2196             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2197             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2198             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2199
2200             fscal            = felec;
2201
2202             fscal            = _mm_and_ps(fscal,cutoff_mask);
2203
2204             /* Calculate temporary vectorial force */
2205             tx               = _mm_mul_ps(fscal,dx12);
2206             ty               = _mm_mul_ps(fscal,dy12);
2207             tz               = _mm_mul_ps(fscal,dz12);
2208
2209             /* Update vectorial force */
2210             fix1             = _mm_add_ps(fix1,tx);
2211             fiy1             = _mm_add_ps(fiy1,ty);
2212             fiz1             = _mm_add_ps(fiz1,tz);
2213
2214             fjx2             = _mm_add_ps(fjx2,tx);
2215             fjy2             = _mm_add_ps(fjy2,ty);
2216             fjz2             = _mm_add_ps(fjz2,tz);
2217             
2218             }
2219
2220             /**************************
2221              * CALCULATE INTERACTIONS *
2222              **************************/
2223
2224             if (gmx_mm_any_lt(rsq13,rcutoff2))
2225             {
2226
2227             r13              = _mm_mul_ps(rsq13,rinv13);
2228
2229             /* EWALD ELECTROSTATICS */
2230
2231             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2232             ewrt             = _mm_mul_ps(r13,ewtabscale);
2233             ewitab           = _mm_cvttps_epi32(ewrt);
2234             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2235             ewitab           = _mm_slli_epi32(ewitab,2);
2236             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2237             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2238             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2239             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2240             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2241             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2242             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2243             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2244             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2245
2246             d                = _mm_sub_ps(r13,rswitch);
2247             d                = _mm_max_ps(d,_mm_setzero_ps());
2248             d2               = _mm_mul_ps(d,d);
2249             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2250
2251             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2252
2253             /* Evaluate switch function */
2254             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2255             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2256             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2257
2258             fscal            = felec;
2259
2260             fscal            = _mm_and_ps(fscal,cutoff_mask);
2261
2262             /* Calculate temporary vectorial force */
2263             tx               = _mm_mul_ps(fscal,dx13);
2264             ty               = _mm_mul_ps(fscal,dy13);
2265             tz               = _mm_mul_ps(fscal,dz13);
2266
2267             /* Update vectorial force */
2268             fix1             = _mm_add_ps(fix1,tx);
2269             fiy1             = _mm_add_ps(fiy1,ty);
2270             fiz1             = _mm_add_ps(fiz1,tz);
2271
2272             fjx3             = _mm_add_ps(fjx3,tx);
2273             fjy3             = _mm_add_ps(fjy3,ty);
2274             fjz3             = _mm_add_ps(fjz3,tz);
2275             
2276             }
2277
2278             /**************************
2279              * CALCULATE INTERACTIONS *
2280              **************************/
2281
2282             if (gmx_mm_any_lt(rsq21,rcutoff2))
2283             {
2284
2285             r21              = _mm_mul_ps(rsq21,rinv21);
2286
2287             /* EWALD ELECTROSTATICS */
2288
2289             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2290             ewrt             = _mm_mul_ps(r21,ewtabscale);
2291             ewitab           = _mm_cvttps_epi32(ewrt);
2292             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2293             ewitab           = _mm_slli_epi32(ewitab,2);
2294             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2295             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2296             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2297             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2298             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2299             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2300             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2301             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2302             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2303
2304             d                = _mm_sub_ps(r21,rswitch);
2305             d                = _mm_max_ps(d,_mm_setzero_ps());
2306             d2               = _mm_mul_ps(d,d);
2307             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2308
2309             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2310
2311             /* Evaluate switch function */
2312             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2313             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2314             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2315
2316             fscal            = felec;
2317
2318             fscal            = _mm_and_ps(fscal,cutoff_mask);
2319
2320             /* Calculate temporary vectorial force */
2321             tx               = _mm_mul_ps(fscal,dx21);
2322             ty               = _mm_mul_ps(fscal,dy21);
2323             tz               = _mm_mul_ps(fscal,dz21);
2324
2325             /* Update vectorial force */
2326             fix2             = _mm_add_ps(fix2,tx);
2327             fiy2             = _mm_add_ps(fiy2,ty);
2328             fiz2             = _mm_add_ps(fiz2,tz);
2329
2330             fjx1             = _mm_add_ps(fjx1,tx);
2331             fjy1             = _mm_add_ps(fjy1,ty);
2332             fjz1             = _mm_add_ps(fjz1,tz);
2333             
2334             }
2335
2336             /**************************
2337              * CALCULATE INTERACTIONS *
2338              **************************/
2339
2340             if (gmx_mm_any_lt(rsq22,rcutoff2))
2341             {
2342
2343             r22              = _mm_mul_ps(rsq22,rinv22);
2344
2345             /* EWALD ELECTROSTATICS */
2346
2347             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2348             ewrt             = _mm_mul_ps(r22,ewtabscale);
2349             ewitab           = _mm_cvttps_epi32(ewrt);
2350             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2351             ewitab           = _mm_slli_epi32(ewitab,2);
2352             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2353             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2354             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2355             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2356             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2357             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2358             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2359             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2360             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2361
2362             d                = _mm_sub_ps(r22,rswitch);
2363             d                = _mm_max_ps(d,_mm_setzero_ps());
2364             d2               = _mm_mul_ps(d,d);
2365             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2366
2367             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2368
2369             /* Evaluate switch function */
2370             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2371             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2372             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2373
2374             fscal            = felec;
2375
2376             fscal            = _mm_and_ps(fscal,cutoff_mask);
2377
2378             /* Calculate temporary vectorial force */
2379             tx               = _mm_mul_ps(fscal,dx22);
2380             ty               = _mm_mul_ps(fscal,dy22);
2381             tz               = _mm_mul_ps(fscal,dz22);
2382
2383             /* Update vectorial force */
2384             fix2             = _mm_add_ps(fix2,tx);
2385             fiy2             = _mm_add_ps(fiy2,ty);
2386             fiz2             = _mm_add_ps(fiz2,tz);
2387
2388             fjx2             = _mm_add_ps(fjx2,tx);
2389             fjy2             = _mm_add_ps(fjy2,ty);
2390             fjz2             = _mm_add_ps(fjz2,tz);
2391             
2392             }
2393
2394             /**************************
2395              * CALCULATE INTERACTIONS *
2396              **************************/
2397
2398             if (gmx_mm_any_lt(rsq23,rcutoff2))
2399             {
2400
2401             r23              = _mm_mul_ps(rsq23,rinv23);
2402
2403             /* EWALD ELECTROSTATICS */
2404
2405             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2406             ewrt             = _mm_mul_ps(r23,ewtabscale);
2407             ewitab           = _mm_cvttps_epi32(ewrt);
2408             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2409             ewitab           = _mm_slli_epi32(ewitab,2);
2410             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2411             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2412             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2413             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2414             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2415             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2416             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2417             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2418             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2419
2420             d                = _mm_sub_ps(r23,rswitch);
2421             d                = _mm_max_ps(d,_mm_setzero_ps());
2422             d2               = _mm_mul_ps(d,d);
2423             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2424
2425             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2426
2427             /* Evaluate switch function */
2428             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2429             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2430             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2431
2432             fscal            = felec;
2433
2434             fscal            = _mm_and_ps(fscal,cutoff_mask);
2435
2436             /* Calculate temporary vectorial force */
2437             tx               = _mm_mul_ps(fscal,dx23);
2438             ty               = _mm_mul_ps(fscal,dy23);
2439             tz               = _mm_mul_ps(fscal,dz23);
2440
2441             /* Update vectorial force */
2442             fix2             = _mm_add_ps(fix2,tx);
2443             fiy2             = _mm_add_ps(fiy2,ty);
2444             fiz2             = _mm_add_ps(fiz2,tz);
2445
2446             fjx3             = _mm_add_ps(fjx3,tx);
2447             fjy3             = _mm_add_ps(fjy3,ty);
2448             fjz3             = _mm_add_ps(fjz3,tz);
2449             
2450             }
2451
2452             /**************************
2453              * CALCULATE INTERACTIONS *
2454              **************************/
2455
2456             if (gmx_mm_any_lt(rsq31,rcutoff2))
2457             {
2458
2459             r31              = _mm_mul_ps(rsq31,rinv31);
2460
2461             /* EWALD ELECTROSTATICS */
2462
2463             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2464             ewrt             = _mm_mul_ps(r31,ewtabscale);
2465             ewitab           = _mm_cvttps_epi32(ewrt);
2466             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2467             ewitab           = _mm_slli_epi32(ewitab,2);
2468             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2469             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2470             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2471             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2472             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2473             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2474             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2475             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2476             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2477
2478             d                = _mm_sub_ps(r31,rswitch);
2479             d                = _mm_max_ps(d,_mm_setzero_ps());
2480             d2               = _mm_mul_ps(d,d);
2481             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2482
2483             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2484
2485             /* Evaluate switch function */
2486             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2487             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2488             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2489
2490             fscal            = felec;
2491
2492             fscal            = _mm_and_ps(fscal,cutoff_mask);
2493
2494             /* Calculate temporary vectorial force */
2495             tx               = _mm_mul_ps(fscal,dx31);
2496             ty               = _mm_mul_ps(fscal,dy31);
2497             tz               = _mm_mul_ps(fscal,dz31);
2498
2499             /* Update vectorial force */
2500             fix3             = _mm_add_ps(fix3,tx);
2501             fiy3             = _mm_add_ps(fiy3,ty);
2502             fiz3             = _mm_add_ps(fiz3,tz);
2503
2504             fjx1             = _mm_add_ps(fjx1,tx);
2505             fjy1             = _mm_add_ps(fjy1,ty);
2506             fjz1             = _mm_add_ps(fjz1,tz);
2507             
2508             }
2509
2510             /**************************
2511              * CALCULATE INTERACTIONS *
2512              **************************/
2513
2514             if (gmx_mm_any_lt(rsq32,rcutoff2))
2515             {
2516
2517             r32              = _mm_mul_ps(rsq32,rinv32);
2518
2519             /* EWALD ELECTROSTATICS */
2520
2521             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2522             ewrt             = _mm_mul_ps(r32,ewtabscale);
2523             ewitab           = _mm_cvttps_epi32(ewrt);
2524             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2525             ewitab           = _mm_slli_epi32(ewitab,2);
2526             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2527             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2528             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2529             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2530             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2531             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2532             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2533             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2534             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2535
2536             d                = _mm_sub_ps(r32,rswitch);
2537             d                = _mm_max_ps(d,_mm_setzero_ps());
2538             d2               = _mm_mul_ps(d,d);
2539             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2540
2541             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2542
2543             /* Evaluate switch function */
2544             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2545             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2546             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2547
2548             fscal            = felec;
2549
2550             fscal            = _mm_and_ps(fscal,cutoff_mask);
2551
2552             /* Calculate temporary vectorial force */
2553             tx               = _mm_mul_ps(fscal,dx32);
2554             ty               = _mm_mul_ps(fscal,dy32);
2555             tz               = _mm_mul_ps(fscal,dz32);
2556
2557             /* Update vectorial force */
2558             fix3             = _mm_add_ps(fix3,tx);
2559             fiy3             = _mm_add_ps(fiy3,ty);
2560             fiz3             = _mm_add_ps(fiz3,tz);
2561
2562             fjx2             = _mm_add_ps(fjx2,tx);
2563             fjy2             = _mm_add_ps(fjy2,ty);
2564             fjz2             = _mm_add_ps(fjz2,tz);
2565             
2566             }
2567
2568             /**************************
2569              * CALCULATE INTERACTIONS *
2570              **************************/
2571
2572             if (gmx_mm_any_lt(rsq33,rcutoff2))
2573             {
2574
2575             r33              = _mm_mul_ps(rsq33,rinv33);
2576
2577             /* EWALD ELECTROSTATICS */
2578
2579             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2580             ewrt             = _mm_mul_ps(r33,ewtabscale);
2581             ewitab           = _mm_cvttps_epi32(ewrt);
2582             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2583             ewitab           = _mm_slli_epi32(ewitab,2);
2584             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2585             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2586             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2587             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2588             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2589             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2590             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2591             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2592             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2593
2594             d                = _mm_sub_ps(r33,rswitch);
2595             d                = _mm_max_ps(d,_mm_setzero_ps());
2596             d2               = _mm_mul_ps(d,d);
2597             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2598
2599             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2600
2601             /* Evaluate switch function */
2602             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2603             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2604             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2605
2606             fscal            = felec;
2607
2608             fscal            = _mm_and_ps(fscal,cutoff_mask);
2609
2610             /* Calculate temporary vectorial force */
2611             tx               = _mm_mul_ps(fscal,dx33);
2612             ty               = _mm_mul_ps(fscal,dy33);
2613             tz               = _mm_mul_ps(fscal,dz33);
2614
2615             /* Update vectorial force */
2616             fix3             = _mm_add_ps(fix3,tx);
2617             fiy3             = _mm_add_ps(fiy3,ty);
2618             fiz3             = _mm_add_ps(fiz3,tz);
2619
2620             fjx3             = _mm_add_ps(fjx3,tx);
2621             fjy3             = _mm_add_ps(fjy3,ty);
2622             fjz3             = _mm_add_ps(fjz3,tz);
2623             
2624             }
2625
2626             fjptrA             = f+j_coord_offsetA;
2627             fjptrB             = f+j_coord_offsetB;
2628             fjptrC             = f+j_coord_offsetC;
2629             fjptrD             = f+j_coord_offsetD;
2630
2631             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2632                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2633                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2634
2635             /* Inner loop uses 617 flops */
2636         }
2637
2638         if(jidx<j_index_end)
2639         {
2640
2641             /* Get j neighbor index, and coordinate index */
2642             jnrlistA         = jjnr[jidx];
2643             jnrlistB         = jjnr[jidx+1];
2644             jnrlistC         = jjnr[jidx+2];
2645             jnrlistD         = jjnr[jidx+3];
2646             /* Sign of each element will be negative for non-real atoms.
2647              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2648              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2649              */
2650             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2651             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2652             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2653             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2654             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2655             j_coord_offsetA  = DIM*jnrA;
2656             j_coord_offsetB  = DIM*jnrB;
2657             j_coord_offsetC  = DIM*jnrC;
2658             j_coord_offsetD  = DIM*jnrD;
2659
2660             /* load j atom coordinates */
2661             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2662                                               x+j_coord_offsetC,x+j_coord_offsetD,
2663                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2664                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2665
2666             /* Calculate displacement vector */
2667             dx00             = _mm_sub_ps(ix0,jx0);
2668             dy00             = _mm_sub_ps(iy0,jy0);
2669             dz00             = _mm_sub_ps(iz0,jz0);
2670             dx11             = _mm_sub_ps(ix1,jx1);
2671             dy11             = _mm_sub_ps(iy1,jy1);
2672             dz11             = _mm_sub_ps(iz1,jz1);
2673             dx12             = _mm_sub_ps(ix1,jx2);
2674             dy12             = _mm_sub_ps(iy1,jy2);
2675             dz12             = _mm_sub_ps(iz1,jz2);
2676             dx13             = _mm_sub_ps(ix1,jx3);
2677             dy13             = _mm_sub_ps(iy1,jy3);
2678             dz13             = _mm_sub_ps(iz1,jz3);
2679             dx21             = _mm_sub_ps(ix2,jx1);
2680             dy21             = _mm_sub_ps(iy2,jy1);
2681             dz21             = _mm_sub_ps(iz2,jz1);
2682             dx22             = _mm_sub_ps(ix2,jx2);
2683             dy22             = _mm_sub_ps(iy2,jy2);
2684             dz22             = _mm_sub_ps(iz2,jz2);
2685             dx23             = _mm_sub_ps(ix2,jx3);
2686             dy23             = _mm_sub_ps(iy2,jy3);
2687             dz23             = _mm_sub_ps(iz2,jz3);
2688             dx31             = _mm_sub_ps(ix3,jx1);
2689             dy31             = _mm_sub_ps(iy3,jy1);
2690             dz31             = _mm_sub_ps(iz3,jz1);
2691             dx32             = _mm_sub_ps(ix3,jx2);
2692             dy32             = _mm_sub_ps(iy3,jy2);
2693             dz32             = _mm_sub_ps(iz3,jz2);
2694             dx33             = _mm_sub_ps(ix3,jx3);
2695             dy33             = _mm_sub_ps(iy3,jy3);
2696             dz33             = _mm_sub_ps(iz3,jz3);
2697
2698             /* Calculate squared distance and things based on it */
2699             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2700             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2701             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2702             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2703             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2704             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2705             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2706             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2707             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2708             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2709
2710             rinv00           = sse2_invsqrt_f(rsq00);
2711             rinv11           = sse2_invsqrt_f(rsq11);
2712             rinv12           = sse2_invsqrt_f(rsq12);
2713             rinv13           = sse2_invsqrt_f(rsq13);
2714             rinv21           = sse2_invsqrt_f(rsq21);
2715             rinv22           = sse2_invsqrt_f(rsq22);
2716             rinv23           = sse2_invsqrt_f(rsq23);
2717             rinv31           = sse2_invsqrt_f(rsq31);
2718             rinv32           = sse2_invsqrt_f(rsq32);
2719             rinv33           = sse2_invsqrt_f(rsq33);
2720
2721             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2722             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2723             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2724             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2725             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2726             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2727             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2728             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2729             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2730             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2731
2732             fjx0             = _mm_setzero_ps();
2733             fjy0             = _mm_setzero_ps();
2734             fjz0             = _mm_setzero_ps();
2735             fjx1             = _mm_setzero_ps();
2736             fjy1             = _mm_setzero_ps();
2737             fjz1             = _mm_setzero_ps();
2738             fjx2             = _mm_setzero_ps();
2739             fjy2             = _mm_setzero_ps();
2740             fjz2             = _mm_setzero_ps();
2741             fjx3             = _mm_setzero_ps();
2742             fjy3             = _mm_setzero_ps();
2743             fjz3             = _mm_setzero_ps();
2744
2745             /**************************
2746              * CALCULATE INTERACTIONS *
2747              **************************/
2748
2749             if (gmx_mm_any_lt(rsq00,rcutoff2))
2750             {
2751
2752             r00              = _mm_mul_ps(rsq00,rinv00);
2753             r00              = _mm_andnot_ps(dummy_mask,r00);
2754
2755             /* LENNARD-JONES DISPERSION/REPULSION */
2756
2757             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2758             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2759             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2760             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2761             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2762
2763             d                = _mm_sub_ps(r00,rswitch);
2764             d                = _mm_max_ps(d,_mm_setzero_ps());
2765             d2               = _mm_mul_ps(d,d);
2766             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2767
2768             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2769
2770             /* Evaluate switch function */
2771             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2772             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2773             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2774
2775             fscal            = fvdw;
2776
2777             fscal            = _mm_and_ps(fscal,cutoff_mask);
2778
2779             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2780
2781             /* Calculate temporary vectorial force */
2782             tx               = _mm_mul_ps(fscal,dx00);
2783             ty               = _mm_mul_ps(fscal,dy00);
2784             tz               = _mm_mul_ps(fscal,dz00);
2785
2786             /* Update vectorial force */
2787             fix0             = _mm_add_ps(fix0,tx);
2788             fiy0             = _mm_add_ps(fiy0,ty);
2789             fiz0             = _mm_add_ps(fiz0,tz);
2790
2791             fjx0             = _mm_add_ps(fjx0,tx);
2792             fjy0             = _mm_add_ps(fjy0,ty);
2793             fjz0             = _mm_add_ps(fjz0,tz);
2794             
2795             }
2796
2797             /**************************
2798              * CALCULATE INTERACTIONS *
2799              **************************/
2800
2801             if (gmx_mm_any_lt(rsq11,rcutoff2))
2802             {
2803
2804             r11              = _mm_mul_ps(rsq11,rinv11);
2805             r11              = _mm_andnot_ps(dummy_mask,r11);
2806
2807             /* EWALD ELECTROSTATICS */
2808
2809             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2810             ewrt             = _mm_mul_ps(r11,ewtabscale);
2811             ewitab           = _mm_cvttps_epi32(ewrt);
2812             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2813             ewitab           = _mm_slli_epi32(ewitab,2);
2814             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2815             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2816             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2817             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2818             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2819             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2820             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2821             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2822             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2823
2824             d                = _mm_sub_ps(r11,rswitch);
2825             d                = _mm_max_ps(d,_mm_setzero_ps());
2826             d2               = _mm_mul_ps(d,d);
2827             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2828
2829             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2830
2831             /* Evaluate switch function */
2832             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2833             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2834             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2835
2836             fscal            = felec;
2837
2838             fscal            = _mm_and_ps(fscal,cutoff_mask);
2839
2840             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2841
2842             /* Calculate temporary vectorial force */
2843             tx               = _mm_mul_ps(fscal,dx11);
2844             ty               = _mm_mul_ps(fscal,dy11);
2845             tz               = _mm_mul_ps(fscal,dz11);
2846
2847             /* Update vectorial force */
2848             fix1             = _mm_add_ps(fix1,tx);
2849             fiy1             = _mm_add_ps(fiy1,ty);
2850             fiz1             = _mm_add_ps(fiz1,tz);
2851
2852             fjx1             = _mm_add_ps(fjx1,tx);
2853             fjy1             = _mm_add_ps(fjy1,ty);
2854             fjz1             = _mm_add_ps(fjz1,tz);
2855             
2856             }
2857
2858             /**************************
2859              * CALCULATE INTERACTIONS *
2860              **************************/
2861
2862             if (gmx_mm_any_lt(rsq12,rcutoff2))
2863             {
2864
2865             r12              = _mm_mul_ps(rsq12,rinv12);
2866             r12              = _mm_andnot_ps(dummy_mask,r12);
2867
2868             /* EWALD ELECTROSTATICS */
2869
2870             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2871             ewrt             = _mm_mul_ps(r12,ewtabscale);
2872             ewitab           = _mm_cvttps_epi32(ewrt);
2873             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2874             ewitab           = _mm_slli_epi32(ewitab,2);
2875             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2876             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2877             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2878             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2879             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2880             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2881             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2882             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2883             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2884
2885             d                = _mm_sub_ps(r12,rswitch);
2886             d                = _mm_max_ps(d,_mm_setzero_ps());
2887             d2               = _mm_mul_ps(d,d);
2888             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2889
2890             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2891
2892             /* Evaluate switch function */
2893             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2894             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2895             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2896
2897             fscal            = felec;
2898
2899             fscal            = _mm_and_ps(fscal,cutoff_mask);
2900
2901             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2902
2903             /* Calculate temporary vectorial force */
2904             tx               = _mm_mul_ps(fscal,dx12);
2905             ty               = _mm_mul_ps(fscal,dy12);
2906             tz               = _mm_mul_ps(fscal,dz12);
2907
2908             /* Update vectorial force */
2909             fix1             = _mm_add_ps(fix1,tx);
2910             fiy1             = _mm_add_ps(fiy1,ty);
2911             fiz1             = _mm_add_ps(fiz1,tz);
2912
2913             fjx2             = _mm_add_ps(fjx2,tx);
2914             fjy2             = _mm_add_ps(fjy2,ty);
2915             fjz2             = _mm_add_ps(fjz2,tz);
2916             
2917             }
2918
2919             /**************************
2920              * CALCULATE INTERACTIONS *
2921              **************************/
2922
2923             if (gmx_mm_any_lt(rsq13,rcutoff2))
2924             {
2925
2926             r13              = _mm_mul_ps(rsq13,rinv13);
2927             r13              = _mm_andnot_ps(dummy_mask,r13);
2928
2929             /* EWALD ELECTROSTATICS */
2930
2931             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2932             ewrt             = _mm_mul_ps(r13,ewtabscale);
2933             ewitab           = _mm_cvttps_epi32(ewrt);
2934             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2935             ewitab           = _mm_slli_epi32(ewitab,2);
2936             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2937             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2938             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2939             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2940             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2941             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2942             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2943             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2944             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2945
2946             d                = _mm_sub_ps(r13,rswitch);
2947             d                = _mm_max_ps(d,_mm_setzero_ps());
2948             d2               = _mm_mul_ps(d,d);
2949             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2950
2951             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2952
2953             /* Evaluate switch function */
2954             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2955             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2956             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2957
2958             fscal            = felec;
2959
2960             fscal            = _mm_and_ps(fscal,cutoff_mask);
2961
2962             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2963
2964             /* Calculate temporary vectorial force */
2965             tx               = _mm_mul_ps(fscal,dx13);
2966             ty               = _mm_mul_ps(fscal,dy13);
2967             tz               = _mm_mul_ps(fscal,dz13);
2968
2969             /* Update vectorial force */
2970             fix1             = _mm_add_ps(fix1,tx);
2971             fiy1             = _mm_add_ps(fiy1,ty);
2972             fiz1             = _mm_add_ps(fiz1,tz);
2973
2974             fjx3             = _mm_add_ps(fjx3,tx);
2975             fjy3             = _mm_add_ps(fjy3,ty);
2976             fjz3             = _mm_add_ps(fjz3,tz);
2977             
2978             }
2979
2980             /**************************
2981              * CALCULATE INTERACTIONS *
2982              **************************/
2983
2984             if (gmx_mm_any_lt(rsq21,rcutoff2))
2985             {
2986
2987             r21              = _mm_mul_ps(rsq21,rinv21);
2988             r21              = _mm_andnot_ps(dummy_mask,r21);
2989
2990             /* EWALD ELECTROSTATICS */
2991
2992             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2993             ewrt             = _mm_mul_ps(r21,ewtabscale);
2994             ewitab           = _mm_cvttps_epi32(ewrt);
2995             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2996             ewitab           = _mm_slli_epi32(ewitab,2);
2997             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2998             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2999             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3000             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3001             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3002             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3003             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3004             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3005             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3006
3007             d                = _mm_sub_ps(r21,rswitch);
3008             d                = _mm_max_ps(d,_mm_setzero_ps());
3009             d2               = _mm_mul_ps(d,d);
3010             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3011
3012             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3013
3014             /* Evaluate switch function */
3015             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3016             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3017             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
3018
3019             fscal            = felec;
3020
3021             fscal            = _mm_and_ps(fscal,cutoff_mask);
3022
3023             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3024
3025             /* Calculate temporary vectorial force */
3026             tx               = _mm_mul_ps(fscal,dx21);
3027             ty               = _mm_mul_ps(fscal,dy21);
3028             tz               = _mm_mul_ps(fscal,dz21);
3029
3030             /* Update vectorial force */
3031             fix2             = _mm_add_ps(fix2,tx);
3032             fiy2             = _mm_add_ps(fiy2,ty);
3033             fiz2             = _mm_add_ps(fiz2,tz);
3034
3035             fjx1             = _mm_add_ps(fjx1,tx);
3036             fjy1             = _mm_add_ps(fjy1,ty);
3037             fjz1             = _mm_add_ps(fjz1,tz);
3038             
3039             }
3040
3041             /**************************
3042              * CALCULATE INTERACTIONS *
3043              **************************/
3044
3045             if (gmx_mm_any_lt(rsq22,rcutoff2))
3046             {
3047
3048             r22              = _mm_mul_ps(rsq22,rinv22);
3049             r22              = _mm_andnot_ps(dummy_mask,r22);
3050
3051             /* EWALD ELECTROSTATICS */
3052
3053             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3054             ewrt             = _mm_mul_ps(r22,ewtabscale);
3055             ewitab           = _mm_cvttps_epi32(ewrt);
3056             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3057             ewitab           = _mm_slli_epi32(ewitab,2);
3058             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3059             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3060             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3061             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3062             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3063             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3064             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3065             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3066             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3067
3068             d                = _mm_sub_ps(r22,rswitch);
3069             d                = _mm_max_ps(d,_mm_setzero_ps());
3070             d2               = _mm_mul_ps(d,d);
3071             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3072
3073             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3074
3075             /* Evaluate switch function */
3076             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3077             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3078             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
3079
3080             fscal            = felec;
3081
3082             fscal            = _mm_and_ps(fscal,cutoff_mask);
3083
3084             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3085
3086             /* Calculate temporary vectorial force */
3087             tx               = _mm_mul_ps(fscal,dx22);
3088             ty               = _mm_mul_ps(fscal,dy22);
3089             tz               = _mm_mul_ps(fscal,dz22);
3090
3091             /* Update vectorial force */
3092             fix2             = _mm_add_ps(fix2,tx);
3093             fiy2             = _mm_add_ps(fiy2,ty);
3094             fiz2             = _mm_add_ps(fiz2,tz);
3095
3096             fjx2             = _mm_add_ps(fjx2,tx);
3097             fjy2             = _mm_add_ps(fjy2,ty);
3098             fjz2             = _mm_add_ps(fjz2,tz);
3099             
3100             }
3101
3102             /**************************
3103              * CALCULATE INTERACTIONS *
3104              **************************/
3105
3106             if (gmx_mm_any_lt(rsq23,rcutoff2))
3107             {
3108
3109             r23              = _mm_mul_ps(rsq23,rinv23);
3110             r23              = _mm_andnot_ps(dummy_mask,r23);
3111
3112             /* EWALD ELECTROSTATICS */
3113
3114             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3115             ewrt             = _mm_mul_ps(r23,ewtabscale);
3116             ewitab           = _mm_cvttps_epi32(ewrt);
3117             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3118             ewitab           = _mm_slli_epi32(ewitab,2);
3119             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3120             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3121             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3122             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3123             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3124             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3125             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3126             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3127             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3128
3129             d                = _mm_sub_ps(r23,rswitch);
3130             d                = _mm_max_ps(d,_mm_setzero_ps());
3131             d2               = _mm_mul_ps(d,d);
3132             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3133
3134             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3135
3136             /* Evaluate switch function */
3137             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3138             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3139             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
3140
3141             fscal            = felec;
3142
3143             fscal            = _mm_and_ps(fscal,cutoff_mask);
3144
3145             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3146
3147             /* Calculate temporary vectorial force */
3148             tx               = _mm_mul_ps(fscal,dx23);
3149             ty               = _mm_mul_ps(fscal,dy23);
3150             tz               = _mm_mul_ps(fscal,dz23);
3151
3152             /* Update vectorial force */
3153             fix2             = _mm_add_ps(fix2,tx);
3154             fiy2             = _mm_add_ps(fiy2,ty);
3155             fiz2             = _mm_add_ps(fiz2,tz);
3156
3157             fjx3             = _mm_add_ps(fjx3,tx);
3158             fjy3             = _mm_add_ps(fjy3,ty);
3159             fjz3             = _mm_add_ps(fjz3,tz);
3160             
3161             }
3162
3163             /**************************
3164              * CALCULATE INTERACTIONS *
3165              **************************/
3166
3167             if (gmx_mm_any_lt(rsq31,rcutoff2))
3168             {
3169
3170             r31              = _mm_mul_ps(rsq31,rinv31);
3171             r31              = _mm_andnot_ps(dummy_mask,r31);
3172
3173             /* EWALD ELECTROSTATICS */
3174
3175             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3176             ewrt             = _mm_mul_ps(r31,ewtabscale);
3177             ewitab           = _mm_cvttps_epi32(ewrt);
3178             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3179             ewitab           = _mm_slli_epi32(ewitab,2);
3180             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3181             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3182             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3183             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3184             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3185             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3186             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3187             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3188             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3189
3190             d                = _mm_sub_ps(r31,rswitch);
3191             d                = _mm_max_ps(d,_mm_setzero_ps());
3192             d2               = _mm_mul_ps(d,d);
3193             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3194
3195             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3196
3197             /* Evaluate switch function */
3198             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3199             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3200             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
3201
3202             fscal            = felec;
3203
3204             fscal            = _mm_and_ps(fscal,cutoff_mask);
3205
3206             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3207
3208             /* Calculate temporary vectorial force */
3209             tx               = _mm_mul_ps(fscal,dx31);
3210             ty               = _mm_mul_ps(fscal,dy31);
3211             tz               = _mm_mul_ps(fscal,dz31);
3212
3213             /* Update vectorial force */
3214             fix3             = _mm_add_ps(fix3,tx);
3215             fiy3             = _mm_add_ps(fiy3,ty);
3216             fiz3             = _mm_add_ps(fiz3,tz);
3217
3218             fjx1             = _mm_add_ps(fjx1,tx);
3219             fjy1             = _mm_add_ps(fjy1,ty);
3220             fjz1             = _mm_add_ps(fjz1,tz);
3221             
3222             }
3223
3224             /**************************
3225              * CALCULATE INTERACTIONS *
3226              **************************/
3227
3228             if (gmx_mm_any_lt(rsq32,rcutoff2))
3229             {
3230
3231             r32              = _mm_mul_ps(rsq32,rinv32);
3232             r32              = _mm_andnot_ps(dummy_mask,r32);
3233
3234             /* EWALD ELECTROSTATICS */
3235
3236             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3237             ewrt             = _mm_mul_ps(r32,ewtabscale);
3238             ewitab           = _mm_cvttps_epi32(ewrt);
3239             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3240             ewitab           = _mm_slli_epi32(ewitab,2);
3241             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3242             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3243             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3244             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3245             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3246             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3247             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3248             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3249             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3250
3251             d                = _mm_sub_ps(r32,rswitch);
3252             d                = _mm_max_ps(d,_mm_setzero_ps());
3253             d2               = _mm_mul_ps(d,d);
3254             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3255
3256             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3257
3258             /* Evaluate switch function */
3259             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3260             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3261             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
3262
3263             fscal            = felec;
3264
3265             fscal            = _mm_and_ps(fscal,cutoff_mask);
3266
3267             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3268
3269             /* Calculate temporary vectorial force */
3270             tx               = _mm_mul_ps(fscal,dx32);
3271             ty               = _mm_mul_ps(fscal,dy32);
3272             tz               = _mm_mul_ps(fscal,dz32);
3273
3274             /* Update vectorial force */
3275             fix3             = _mm_add_ps(fix3,tx);
3276             fiy3             = _mm_add_ps(fiy3,ty);
3277             fiz3             = _mm_add_ps(fiz3,tz);
3278
3279             fjx2             = _mm_add_ps(fjx2,tx);
3280             fjy2             = _mm_add_ps(fjy2,ty);
3281             fjz2             = _mm_add_ps(fjz2,tz);
3282             
3283             }
3284
3285             /**************************
3286              * CALCULATE INTERACTIONS *
3287              **************************/
3288
3289             if (gmx_mm_any_lt(rsq33,rcutoff2))
3290             {
3291
3292             r33              = _mm_mul_ps(rsq33,rinv33);
3293             r33              = _mm_andnot_ps(dummy_mask,r33);
3294
3295             /* EWALD ELECTROSTATICS */
3296
3297             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3298             ewrt             = _mm_mul_ps(r33,ewtabscale);
3299             ewitab           = _mm_cvttps_epi32(ewrt);
3300             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3301             ewitab           = _mm_slli_epi32(ewitab,2);
3302             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3303             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3304             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3305             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3306             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3307             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3308             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3309             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3310             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3311
3312             d                = _mm_sub_ps(r33,rswitch);
3313             d                = _mm_max_ps(d,_mm_setzero_ps());
3314             d2               = _mm_mul_ps(d,d);
3315             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3316
3317             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3318
3319             /* Evaluate switch function */
3320             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3321             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3322             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
3323
3324             fscal            = felec;
3325
3326             fscal            = _mm_and_ps(fscal,cutoff_mask);
3327
3328             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3329
3330             /* Calculate temporary vectorial force */
3331             tx               = _mm_mul_ps(fscal,dx33);
3332             ty               = _mm_mul_ps(fscal,dy33);
3333             tz               = _mm_mul_ps(fscal,dz33);
3334
3335             /* Update vectorial force */
3336             fix3             = _mm_add_ps(fix3,tx);
3337             fiy3             = _mm_add_ps(fiy3,ty);
3338             fiz3             = _mm_add_ps(fiz3,tz);
3339
3340             fjx3             = _mm_add_ps(fjx3,tx);
3341             fjy3             = _mm_add_ps(fjy3,ty);
3342             fjz3             = _mm_add_ps(fjz3,tz);
3343             
3344             }
3345
3346             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3347             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3348             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3349             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3350
3351             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3352                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3353                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3354
3355             /* Inner loop uses 627 flops */
3356         }
3357
3358         /* End of innermost loop */
3359
3360         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3361                                               f+i_coord_offset,fshift+i_shift_offset);
3362
3363         /* Increment number of inner iterations */
3364         inneriter                  += j_index_end - j_index_start;
3365
3366         /* Outer loop uses 24 flops */
3367     }
3368
3369     /* Increment number of outer iterations */
3370     outeriter        += nri;
3371
3372     /* Update outer/inner flops */
3373
3374     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);
3375 }