Merge 'release-4-6' into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
62     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
63     real             shX,shY,shZ,rcutoff_scalar;
64     real             *shiftvec,*fshift,*x,*f;
65     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
66     int              vdwioffset0;
67     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
68     int              vdwioffset1;
69     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
70     int              vdwioffset2;
71     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
72     int              vdwioffset3;
73     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
75     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
77     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
78     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
79     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
80     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
81     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
82     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
84     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
85     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
86     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
87     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
88     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
89     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
90     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
91     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
92     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
93     real             *charge;
94     int              nvdwtype;
95     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
96     int              *vdwtype;
97     real             *vdwparam;
98     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
99     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
100     __m128i          ewitab;
101     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
102     real             *ewtab;
103     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
104     real             rswitch_scalar,d_scalar;
105     __m128           dummy_mask,cutoff_mask;
106     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
107     __m128           one     = _mm_set1_ps(1.0);
108     __m128           two     = _mm_set1_ps(2.0);
109     x                = xx[0];
110     f                = ff[0];
111
112     nri              = nlist->nri;
113     iinr             = nlist->iinr;
114     jindex           = nlist->jindex;
115     jjnr             = nlist->jjnr;
116     shiftidx         = nlist->shift;
117     gid              = nlist->gid;
118     shiftvec         = fr->shift_vec[0];
119     fshift           = fr->fshift[0];
120     facel            = _mm_set1_ps(fr->epsfac);
121     charge           = mdatoms->chargeA;
122     nvdwtype         = fr->ntype;
123     vdwparam         = fr->nbfp;
124     vdwtype          = mdatoms->typeA;
125
126     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
127     ewtab            = fr->ic->tabq_coul_FDV0;
128     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
129     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
130
131     /* Setup water-specific parameters */
132     inr              = nlist->iinr[0];
133     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
134     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
135     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
136     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
137
138     jq1              = _mm_set1_ps(charge[inr+1]);
139     jq2              = _mm_set1_ps(charge[inr+2]);
140     jq3              = _mm_set1_ps(charge[inr+3]);
141     vdwjidx0A        = 2*vdwtype[inr+0];
142     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
143     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
144     qq11             = _mm_mul_ps(iq1,jq1);
145     qq12             = _mm_mul_ps(iq1,jq2);
146     qq13             = _mm_mul_ps(iq1,jq3);
147     qq21             = _mm_mul_ps(iq2,jq1);
148     qq22             = _mm_mul_ps(iq2,jq2);
149     qq23             = _mm_mul_ps(iq2,jq3);
150     qq31             = _mm_mul_ps(iq3,jq1);
151     qq32             = _mm_mul_ps(iq3,jq2);
152     qq33             = _mm_mul_ps(iq3,jq3);
153
154     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
155     rcutoff_scalar   = fr->rcoulomb;
156     rcutoff          = _mm_set1_ps(rcutoff_scalar);
157     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
158
159     rswitch_scalar   = fr->rcoulomb_switch;
160     rswitch          = _mm_set1_ps(rswitch_scalar);
161     /* Setup switch parameters */
162     d_scalar         = rcutoff_scalar-rswitch_scalar;
163     d                = _mm_set1_ps(d_scalar);
164     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
165     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
166     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
167     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
168     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
170
171     /* Avoid stupid compiler warnings */
172     jnrA = jnrB = jnrC = jnrD = 0;
173     j_coord_offsetA = 0;
174     j_coord_offsetB = 0;
175     j_coord_offsetC = 0;
176     j_coord_offsetD = 0;
177
178     outeriter        = 0;
179     inneriter        = 0;
180
181     /* Start outer loop over neighborlists */
182     for(iidx=0; iidx<nri; iidx++)
183     {
184         /* Load shift vector for this list */
185         i_shift_offset   = DIM*shiftidx[iidx];
186         shX              = shiftvec[i_shift_offset+XX];
187         shY              = shiftvec[i_shift_offset+YY];
188         shZ              = shiftvec[i_shift_offset+ZZ];
189
190         /* Load limits for loop over neighbors */
191         j_index_start    = jindex[iidx];
192         j_index_end      = jindex[iidx+1];
193
194         /* Get outer coordinate index */
195         inr              = iinr[iidx];
196         i_coord_offset   = DIM*inr;
197
198         /* Load i particle coords and add shift vector */
199         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
200         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
201         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
202         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
203         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
204         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
205         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
206         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
207         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
208         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
209         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
210         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
211
212         fix0             = _mm_setzero_ps();
213         fiy0             = _mm_setzero_ps();
214         fiz0             = _mm_setzero_ps();
215         fix1             = _mm_setzero_ps();
216         fiy1             = _mm_setzero_ps();
217         fiz1             = _mm_setzero_ps();
218         fix2             = _mm_setzero_ps();
219         fiy2             = _mm_setzero_ps();
220         fiz2             = _mm_setzero_ps();
221         fix3             = _mm_setzero_ps();
222         fiy3             = _mm_setzero_ps();
223         fiz3             = _mm_setzero_ps();
224
225         /* Reset potential sums */
226         velecsum         = _mm_setzero_ps();
227         vvdwsum          = _mm_setzero_ps();
228
229         /* Start inner kernel loop */
230         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
231         {
232
233             /* Get j neighbor index, and coordinate index */
234             jnrA             = jjnr[jidx];
235             jnrB             = jjnr[jidx+1];
236             jnrC             = jjnr[jidx+2];
237             jnrD             = jjnr[jidx+3];
238
239             j_coord_offsetA  = DIM*jnrA;
240             j_coord_offsetB  = DIM*jnrB;
241             j_coord_offsetC  = DIM*jnrC;
242             j_coord_offsetD  = DIM*jnrD;
243
244             /* load j atom coordinates */
245             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
246                                               x+j_coord_offsetC,x+j_coord_offsetD,
247                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
248                                               &jy2,&jz2,&jx3,&jy3,&jz3);
249
250             /* Calculate displacement vector */
251             dx00             = _mm_sub_ps(ix0,jx0);
252             dy00             = _mm_sub_ps(iy0,jy0);
253             dz00             = _mm_sub_ps(iz0,jz0);
254             dx11             = _mm_sub_ps(ix1,jx1);
255             dy11             = _mm_sub_ps(iy1,jy1);
256             dz11             = _mm_sub_ps(iz1,jz1);
257             dx12             = _mm_sub_ps(ix1,jx2);
258             dy12             = _mm_sub_ps(iy1,jy2);
259             dz12             = _mm_sub_ps(iz1,jz2);
260             dx13             = _mm_sub_ps(ix1,jx3);
261             dy13             = _mm_sub_ps(iy1,jy3);
262             dz13             = _mm_sub_ps(iz1,jz3);
263             dx21             = _mm_sub_ps(ix2,jx1);
264             dy21             = _mm_sub_ps(iy2,jy1);
265             dz21             = _mm_sub_ps(iz2,jz1);
266             dx22             = _mm_sub_ps(ix2,jx2);
267             dy22             = _mm_sub_ps(iy2,jy2);
268             dz22             = _mm_sub_ps(iz2,jz2);
269             dx23             = _mm_sub_ps(ix2,jx3);
270             dy23             = _mm_sub_ps(iy2,jy3);
271             dz23             = _mm_sub_ps(iz2,jz3);
272             dx31             = _mm_sub_ps(ix3,jx1);
273             dy31             = _mm_sub_ps(iy3,jy1);
274             dz31             = _mm_sub_ps(iz3,jz1);
275             dx32             = _mm_sub_ps(ix3,jx2);
276             dy32             = _mm_sub_ps(iy3,jy2);
277             dz32             = _mm_sub_ps(iz3,jz2);
278             dx33             = _mm_sub_ps(ix3,jx3);
279             dy33             = _mm_sub_ps(iy3,jy3);
280             dz33             = _mm_sub_ps(iz3,jz3);
281
282             /* Calculate squared distance and things based on it */
283             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
284             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
285             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
286             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
287             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
288             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
289             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
290             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
291             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
292             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
293
294             rinv00           = gmx_mm_invsqrt_ps(rsq00);
295             rinv11           = gmx_mm_invsqrt_ps(rsq11);
296             rinv12           = gmx_mm_invsqrt_ps(rsq12);
297             rinv13           = gmx_mm_invsqrt_ps(rsq13);
298             rinv21           = gmx_mm_invsqrt_ps(rsq21);
299             rinv22           = gmx_mm_invsqrt_ps(rsq22);
300             rinv23           = gmx_mm_invsqrt_ps(rsq23);
301             rinv31           = gmx_mm_invsqrt_ps(rsq31);
302             rinv32           = gmx_mm_invsqrt_ps(rsq32);
303             rinv33           = gmx_mm_invsqrt_ps(rsq33);
304
305             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
306             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
307             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
308             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
309             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
310             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
311             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
312             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
313             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
314             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
315
316             fjx0             = _mm_setzero_ps();
317             fjy0             = _mm_setzero_ps();
318             fjz0             = _mm_setzero_ps();
319             fjx1             = _mm_setzero_ps();
320             fjy1             = _mm_setzero_ps();
321             fjz1             = _mm_setzero_ps();
322             fjx2             = _mm_setzero_ps();
323             fjy2             = _mm_setzero_ps();
324             fjz2             = _mm_setzero_ps();
325             fjx3             = _mm_setzero_ps();
326             fjy3             = _mm_setzero_ps();
327             fjz3             = _mm_setzero_ps();
328
329             /**************************
330              * CALCULATE INTERACTIONS *
331              **************************/
332
333             if (gmx_mm_any_lt(rsq00,rcutoff2))
334             {
335
336             r00              = _mm_mul_ps(rsq00,rinv00);
337
338             /* LENNARD-JONES DISPERSION/REPULSION */
339
340             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
341             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
342             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
343             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
344             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
345
346             d                = _mm_sub_ps(r00,rswitch);
347             d                = _mm_max_ps(d,_mm_setzero_ps());
348             d2               = _mm_mul_ps(d,d);
349             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
350
351             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
352
353             /* Evaluate switch function */
354             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
355             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
356             vvdw             = _mm_mul_ps(vvdw,sw);
357             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
358
359             /* Update potential sum for this i atom from the interaction with this j atom. */
360             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
361             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
362
363             fscal            = fvdw;
364
365             fscal            = _mm_and_ps(fscal,cutoff_mask);
366
367             /* Calculate temporary vectorial force */
368             tx               = _mm_mul_ps(fscal,dx00);
369             ty               = _mm_mul_ps(fscal,dy00);
370             tz               = _mm_mul_ps(fscal,dz00);
371
372             /* Update vectorial force */
373             fix0             = _mm_add_ps(fix0,tx);
374             fiy0             = _mm_add_ps(fiy0,ty);
375             fiz0             = _mm_add_ps(fiz0,tz);
376
377             fjx0             = _mm_add_ps(fjx0,tx);
378             fjy0             = _mm_add_ps(fjy0,ty);
379             fjz0             = _mm_add_ps(fjz0,tz);
380
381             }
382
383             /**************************
384              * CALCULATE INTERACTIONS *
385              **************************/
386
387             if (gmx_mm_any_lt(rsq11,rcutoff2))
388             {
389
390             r11              = _mm_mul_ps(rsq11,rinv11);
391
392             /* EWALD ELECTROSTATICS */
393
394             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
395             ewrt             = _mm_mul_ps(r11,ewtabscale);
396             ewitab           = _mm_cvttps_epi32(ewrt);
397             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
398             ewitab           = _mm_slli_epi32(ewitab,2);
399             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
400             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
401             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
402             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
403             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
404             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
405             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
406             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
407             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
408
409             d                = _mm_sub_ps(r11,rswitch);
410             d                = _mm_max_ps(d,_mm_setzero_ps());
411             d2               = _mm_mul_ps(d,d);
412             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
413
414             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
415
416             /* Evaluate switch function */
417             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
418             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
419             velec            = _mm_mul_ps(velec,sw);
420             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
421
422             /* Update potential sum for this i atom from the interaction with this j atom. */
423             velec            = _mm_and_ps(velec,cutoff_mask);
424             velecsum         = _mm_add_ps(velecsum,velec);
425
426             fscal            = felec;
427
428             fscal            = _mm_and_ps(fscal,cutoff_mask);
429
430             /* Calculate temporary vectorial force */
431             tx               = _mm_mul_ps(fscal,dx11);
432             ty               = _mm_mul_ps(fscal,dy11);
433             tz               = _mm_mul_ps(fscal,dz11);
434
435             /* Update vectorial force */
436             fix1             = _mm_add_ps(fix1,tx);
437             fiy1             = _mm_add_ps(fiy1,ty);
438             fiz1             = _mm_add_ps(fiz1,tz);
439
440             fjx1             = _mm_add_ps(fjx1,tx);
441             fjy1             = _mm_add_ps(fjy1,ty);
442             fjz1             = _mm_add_ps(fjz1,tz);
443
444             }
445
446             /**************************
447              * CALCULATE INTERACTIONS *
448              **************************/
449
450             if (gmx_mm_any_lt(rsq12,rcutoff2))
451             {
452
453             r12              = _mm_mul_ps(rsq12,rinv12);
454
455             /* EWALD ELECTROSTATICS */
456
457             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
458             ewrt             = _mm_mul_ps(r12,ewtabscale);
459             ewitab           = _mm_cvttps_epi32(ewrt);
460             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
461             ewitab           = _mm_slli_epi32(ewitab,2);
462             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
463             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
464             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
465             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
466             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
467             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
468             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
469             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
470             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
471
472             d                = _mm_sub_ps(r12,rswitch);
473             d                = _mm_max_ps(d,_mm_setzero_ps());
474             d2               = _mm_mul_ps(d,d);
475             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
476
477             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
478
479             /* Evaluate switch function */
480             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
481             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
482             velec            = _mm_mul_ps(velec,sw);
483             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
484
485             /* Update potential sum for this i atom from the interaction with this j atom. */
486             velec            = _mm_and_ps(velec,cutoff_mask);
487             velecsum         = _mm_add_ps(velecsum,velec);
488
489             fscal            = felec;
490
491             fscal            = _mm_and_ps(fscal,cutoff_mask);
492
493             /* Calculate temporary vectorial force */
494             tx               = _mm_mul_ps(fscal,dx12);
495             ty               = _mm_mul_ps(fscal,dy12);
496             tz               = _mm_mul_ps(fscal,dz12);
497
498             /* Update vectorial force */
499             fix1             = _mm_add_ps(fix1,tx);
500             fiy1             = _mm_add_ps(fiy1,ty);
501             fiz1             = _mm_add_ps(fiz1,tz);
502
503             fjx2             = _mm_add_ps(fjx2,tx);
504             fjy2             = _mm_add_ps(fjy2,ty);
505             fjz2             = _mm_add_ps(fjz2,tz);
506
507             }
508
509             /**************************
510              * CALCULATE INTERACTIONS *
511              **************************/
512
513             if (gmx_mm_any_lt(rsq13,rcutoff2))
514             {
515
516             r13              = _mm_mul_ps(rsq13,rinv13);
517
518             /* EWALD ELECTROSTATICS */
519
520             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
521             ewrt             = _mm_mul_ps(r13,ewtabscale);
522             ewitab           = _mm_cvttps_epi32(ewrt);
523             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
524             ewitab           = _mm_slli_epi32(ewitab,2);
525             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
526             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
527             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
528             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
529             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
530             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
531             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
532             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
533             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
534
535             d                = _mm_sub_ps(r13,rswitch);
536             d                = _mm_max_ps(d,_mm_setzero_ps());
537             d2               = _mm_mul_ps(d,d);
538             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
539
540             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
541
542             /* Evaluate switch function */
543             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
544             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
545             velec            = _mm_mul_ps(velec,sw);
546             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
547
548             /* Update potential sum for this i atom from the interaction with this j atom. */
549             velec            = _mm_and_ps(velec,cutoff_mask);
550             velecsum         = _mm_add_ps(velecsum,velec);
551
552             fscal            = felec;
553
554             fscal            = _mm_and_ps(fscal,cutoff_mask);
555
556             /* Calculate temporary vectorial force */
557             tx               = _mm_mul_ps(fscal,dx13);
558             ty               = _mm_mul_ps(fscal,dy13);
559             tz               = _mm_mul_ps(fscal,dz13);
560
561             /* Update vectorial force */
562             fix1             = _mm_add_ps(fix1,tx);
563             fiy1             = _mm_add_ps(fiy1,ty);
564             fiz1             = _mm_add_ps(fiz1,tz);
565
566             fjx3             = _mm_add_ps(fjx3,tx);
567             fjy3             = _mm_add_ps(fjy3,ty);
568             fjz3             = _mm_add_ps(fjz3,tz);
569
570             }
571
572             /**************************
573              * CALCULATE INTERACTIONS *
574              **************************/
575
576             if (gmx_mm_any_lt(rsq21,rcutoff2))
577             {
578
579             r21              = _mm_mul_ps(rsq21,rinv21);
580
581             /* EWALD ELECTROSTATICS */
582
583             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
584             ewrt             = _mm_mul_ps(r21,ewtabscale);
585             ewitab           = _mm_cvttps_epi32(ewrt);
586             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
587             ewitab           = _mm_slli_epi32(ewitab,2);
588             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
589             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
590             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
591             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
592             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
593             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
594             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
595             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
596             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
597
598             d                = _mm_sub_ps(r21,rswitch);
599             d                = _mm_max_ps(d,_mm_setzero_ps());
600             d2               = _mm_mul_ps(d,d);
601             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
602
603             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
604
605             /* Evaluate switch function */
606             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
607             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
608             velec            = _mm_mul_ps(velec,sw);
609             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
610
611             /* Update potential sum for this i atom from the interaction with this j atom. */
612             velec            = _mm_and_ps(velec,cutoff_mask);
613             velecsum         = _mm_add_ps(velecsum,velec);
614
615             fscal            = felec;
616
617             fscal            = _mm_and_ps(fscal,cutoff_mask);
618
619             /* Calculate temporary vectorial force */
620             tx               = _mm_mul_ps(fscal,dx21);
621             ty               = _mm_mul_ps(fscal,dy21);
622             tz               = _mm_mul_ps(fscal,dz21);
623
624             /* Update vectorial force */
625             fix2             = _mm_add_ps(fix2,tx);
626             fiy2             = _mm_add_ps(fiy2,ty);
627             fiz2             = _mm_add_ps(fiz2,tz);
628
629             fjx1             = _mm_add_ps(fjx1,tx);
630             fjy1             = _mm_add_ps(fjy1,ty);
631             fjz1             = _mm_add_ps(fjz1,tz);
632
633             }
634
635             /**************************
636              * CALCULATE INTERACTIONS *
637              **************************/
638
639             if (gmx_mm_any_lt(rsq22,rcutoff2))
640             {
641
642             r22              = _mm_mul_ps(rsq22,rinv22);
643
644             /* EWALD ELECTROSTATICS */
645
646             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
647             ewrt             = _mm_mul_ps(r22,ewtabscale);
648             ewitab           = _mm_cvttps_epi32(ewrt);
649             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
650             ewitab           = _mm_slli_epi32(ewitab,2);
651             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
652             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
653             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
654             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
655             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
656             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
657             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
658             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
659             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
660
661             d                = _mm_sub_ps(r22,rswitch);
662             d                = _mm_max_ps(d,_mm_setzero_ps());
663             d2               = _mm_mul_ps(d,d);
664             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
665
666             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
667
668             /* Evaluate switch function */
669             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
670             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
671             velec            = _mm_mul_ps(velec,sw);
672             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
673
674             /* Update potential sum for this i atom from the interaction with this j atom. */
675             velec            = _mm_and_ps(velec,cutoff_mask);
676             velecsum         = _mm_add_ps(velecsum,velec);
677
678             fscal            = felec;
679
680             fscal            = _mm_and_ps(fscal,cutoff_mask);
681
682             /* Calculate temporary vectorial force */
683             tx               = _mm_mul_ps(fscal,dx22);
684             ty               = _mm_mul_ps(fscal,dy22);
685             tz               = _mm_mul_ps(fscal,dz22);
686
687             /* Update vectorial force */
688             fix2             = _mm_add_ps(fix2,tx);
689             fiy2             = _mm_add_ps(fiy2,ty);
690             fiz2             = _mm_add_ps(fiz2,tz);
691
692             fjx2             = _mm_add_ps(fjx2,tx);
693             fjy2             = _mm_add_ps(fjy2,ty);
694             fjz2             = _mm_add_ps(fjz2,tz);
695
696             }
697
698             /**************************
699              * CALCULATE INTERACTIONS *
700              **************************/
701
702             if (gmx_mm_any_lt(rsq23,rcutoff2))
703             {
704
705             r23              = _mm_mul_ps(rsq23,rinv23);
706
707             /* EWALD ELECTROSTATICS */
708
709             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
710             ewrt             = _mm_mul_ps(r23,ewtabscale);
711             ewitab           = _mm_cvttps_epi32(ewrt);
712             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
713             ewitab           = _mm_slli_epi32(ewitab,2);
714             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
715             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
716             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
717             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
718             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
719             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
720             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
721             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
722             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
723
724             d                = _mm_sub_ps(r23,rswitch);
725             d                = _mm_max_ps(d,_mm_setzero_ps());
726             d2               = _mm_mul_ps(d,d);
727             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
728
729             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
730
731             /* Evaluate switch function */
732             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
733             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
734             velec            = _mm_mul_ps(velec,sw);
735             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
736
737             /* Update potential sum for this i atom from the interaction with this j atom. */
738             velec            = _mm_and_ps(velec,cutoff_mask);
739             velecsum         = _mm_add_ps(velecsum,velec);
740
741             fscal            = felec;
742
743             fscal            = _mm_and_ps(fscal,cutoff_mask);
744
745             /* Calculate temporary vectorial force */
746             tx               = _mm_mul_ps(fscal,dx23);
747             ty               = _mm_mul_ps(fscal,dy23);
748             tz               = _mm_mul_ps(fscal,dz23);
749
750             /* Update vectorial force */
751             fix2             = _mm_add_ps(fix2,tx);
752             fiy2             = _mm_add_ps(fiy2,ty);
753             fiz2             = _mm_add_ps(fiz2,tz);
754
755             fjx3             = _mm_add_ps(fjx3,tx);
756             fjy3             = _mm_add_ps(fjy3,ty);
757             fjz3             = _mm_add_ps(fjz3,tz);
758
759             }
760
761             /**************************
762              * CALCULATE INTERACTIONS *
763              **************************/
764
765             if (gmx_mm_any_lt(rsq31,rcutoff2))
766             {
767
768             r31              = _mm_mul_ps(rsq31,rinv31);
769
770             /* EWALD ELECTROSTATICS */
771
772             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
773             ewrt             = _mm_mul_ps(r31,ewtabscale);
774             ewitab           = _mm_cvttps_epi32(ewrt);
775             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
776             ewitab           = _mm_slli_epi32(ewitab,2);
777             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
778             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
779             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
780             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
781             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
782             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
783             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
784             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
785             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
786
787             d                = _mm_sub_ps(r31,rswitch);
788             d                = _mm_max_ps(d,_mm_setzero_ps());
789             d2               = _mm_mul_ps(d,d);
790             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
791
792             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
793
794             /* Evaluate switch function */
795             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
796             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
797             velec            = _mm_mul_ps(velec,sw);
798             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
799
800             /* Update potential sum for this i atom from the interaction with this j atom. */
801             velec            = _mm_and_ps(velec,cutoff_mask);
802             velecsum         = _mm_add_ps(velecsum,velec);
803
804             fscal            = felec;
805
806             fscal            = _mm_and_ps(fscal,cutoff_mask);
807
808             /* Calculate temporary vectorial force */
809             tx               = _mm_mul_ps(fscal,dx31);
810             ty               = _mm_mul_ps(fscal,dy31);
811             tz               = _mm_mul_ps(fscal,dz31);
812
813             /* Update vectorial force */
814             fix3             = _mm_add_ps(fix3,tx);
815             fiy3             = _mm_add_ps(fiy3,ty);
816             fiz3             = _mm_add_ps(fiz3,tz);
817
818             fjx1             = _mm_add_ps(fjx1,tx);
819             fjy1             = _mm_add_ps(fjy1,ty);
820             fjz1             = _mm_add_ps(fjz1,tz);
821
822             }
823
824             /**************************
825              * CALCULATE INTERACTIONS *
826              **************************/
827
828             if (gmx_mm_any_lt(rsq32,rcutoff2))
829             {
830
831             r32              = _mm_mul_ps(rsq32,rinv32);
832
833             /* EWALD ELECTROSTATICS */
834
835             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
836             ewrt             = _mm_mul_ps(r32,ewtabscale);
837             ewitab           = _mm_cvttps_epi32(ewrt);
838             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
839             ewitab           = _mm_slli_epi32(ewitab,2);
840             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
841             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
842             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
843             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
844             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
845             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
846             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
847             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
848             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
849
850             d                = _mm_sub_ps(r32,rswitch);
851             d                = _mm_max_ps(d,_mm_setzero_ps());
852             d2               = _mm_mul_ps(d,d);
853             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
854
855             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
856
857             /* Evaluate switch function */
858             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
859             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
860             velec            = _mm_mul_ps(velec,sw);
861             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
862
863             /* Update potential sum for this i atom from the interaction with this j atom. */
864             velec            = _mm_and_ps(velec,cutoff_mask);
865             velecsum         = _mm_add_ps(velecsum,velec);
866
867             fscal            = felec;
868
869             fscal            = _mm_and_ps(fscal,cutoff_mask);
870
871             /* Calculate temporary vectorial force */
872             tx               = _mm_mul_ps(fscal,dx32);
873             ty               = _mm_mul_ps(fscal,dy32);
874             tz               = _mm_mul_ps(fscal,dz32);
875
876             /* Update vectorial force */
877             fix3             = _mm_add_ps(fix3,tx);
878             fiy3             = _mm_add_ps(fiy3,ty);
879             fiz3             = _mm_add_ps(fiz3,tz);
880
881             fjx2             = _mm_add_ps(fjx2,tx);
882             fjy2             = _mm_add_ps(fjy2,ty);
883             fjz2             = _mm_add_ps(fjz2,tz);
884
885             }
886
887             /**************************
888              * CALCULATE INTERACTIONS *
889              **************************/
890
891             if (gmx_mm_any_lt(rsq33,rcutoff2))
892             {
893
894             r33              = _mm_mul_ps(rsq33,rinv33);
895
896             /* EWALD ELECTROSTATICS */
897
898             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
899             ewrt             = _mm_mul_ps(r33,ewtabscale);
900             ewitab           = _mm_cvttps_epi32(ewrt);
901             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
902             ewitab           = _mm_slli_epi32(ewitab,2);
903             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
904             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
905             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
906             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
907             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
908             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
909             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
910             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
911             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
912
913             d                = _mm_sub_ps(r33,rswitch);
914             d                = _mm_max_ps(d,_mm_setzero_ps());
915             d2               = _mm_mul_ps(d,d);
916             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
917
918             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
919
920             /* Evaluate switch function */
921             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
922             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
923             velec            = _mm_mul_ps(velec,sw);
924             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
925
926             /* Update potential sum for this i atom from the interaction with this j atom. */
927             velec            = _mm_and_ps(velec,cutoff_mask);
928             velecsum         = _mm_add_ps(velecsum,velec);
929
930             fscal            = felec;
931
932             fscal            = _mm_and_ps(fscal,cutoff_mask);
933
934             /* Calculate temporary vectorial force */
935             tx               = _mm_mul_ps(fscal,dx33);
936             ty               = _mm_mul_ps(fscal,dy33);
937             tz               = _mm_mul_ps(fscal,dz33);
938
939             /* Update vectorial force */
940             fix3             = _mm_add_ps(fix3,tx);
941             fiy3             = _mm_add_ps(fiy3,ty);
942             fiz3             = _mm_add_ps(fiz3,tz);
943
944             fjx3             = _mm_add_ps(fjx3,tx);
945             fjy3             = _mm_add_ps(fjy3,ty);
946             fjz3             = _mm_add_ps(fjz3,tz);
947
948             }
949
950             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
951                                                    f+j_coord_offsetC,f+j_coord_offsetD,
952                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
953                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
954
955             /* Inner loop uses 647 flops */
956         }
957
958         if(jidx<j_index_end)
959         {
960
961             /* Get j neighbor index, and coordinate index */
962             jnrA             = jjnr[jidx];
963             jnrB             = jjnr[jidx+1];
964             jnrC             = jjnr[jidx+2];
965             jnrD             = jjnr[jidx+3];
966
967             /* Sign of each element will be negative for non-real atoms.
968              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
969              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
970              */
971             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
972             jnrA       = (jnrA>=0) ? jnrA : 0;
973             jnrB       = (jnrB>=0) ? jnrB : 0;
974             jnrC       = (jnrC>=0) ? jnrC : 0;
975             jnrD       = (jnrD>=0) ? jnrD : 0;
976
977             j_coord_offsetA  = DIM*jnrA;
978             j_coord_offsetB  = DIM*jnrB;
979             j_coord_offsetC  = DIM*jnrC;
980             j_coord_offsetD  = DIM*jnrD;
981
982             /* load j atom coordinates */
983             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
984                                               x+j_coord_offsetC,x+j_coord_offsetD,
985                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
986                                               &jy2,&jz2,&jx3,&jy3,&jz3);
987
988             /* Calculate displacement vector */
989             dx00             = _mm_sub_ps(ix0,jx0);
990             dy00             = _mm_sub_ps(iy0,jy0);
991             dz00             = _mm_sub_ps(iz0,jz0);
992             dx11             = _mm_sub_ps(ix1,jx1);
993             dy11             = _mm_sub_ps(iy1,jy1);
994             dz11             = _mm_sub_ps(iz1,jz1);
995             dx12             = _mm_sub_ps(ix1,jx2);
996             dy12             = _mm_sub_ps(iy1,jy2);
997             dz12             = _mm_sub_ps(iz1,jz2);
998             dx13             = _mm_sub_ps(ix1,jx3);
999             dy13             = _mm_sub_ps(iy1,jy3);
1000             dz13             = _mm_sub_ps(iz1,jz3);
1001             dx21             = _mm_sub_ps(ix2,jx1);
1002             dy21             = _mm_sub_ps(iy2,jy1);
1003             dz21             = _mm_sub_ps(iz2,jz1);
1004             dx22             = _mm_sub_ps(ix2,jx2);
1005             dy22             = _mm_sub_ps(iy2,jy2);
1006             dz22             = _mm_sub_ps(iz2,jz2);
1007             dx23             = _mm_sub_ps(ix2,jx3);
1008             dy23             = _mm_sub_ps(iy2,jy3);
1009             dz23             = _mm_sub_ps(iz2,jz3);
1010             dx31             = _mm_sub_ps(ix3,jx1);
1011             dy31             = _mm_sub_ps(iy3,jy1);
1012             dz31             = _mm_sub_ps(iz3,jz1);
1013             dx32             = _mm_sub_ps(ix3,jx2);
1014             dy32             = _mm_sub_ps(iy3,jy2);
1015             dz32             = _mm_sub_ps(iz3,jz2);
1016             dx33             = _mm_sub_ps(ix3,jx3);
1017             dy33             = _mm_sub_ps(iy3,jy3);
1018             dz33             = _mm_sub_ps(iz3,jz3);
1019
1020             /* Calculate squared distance and things based on it */
1021             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1022             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1023             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1024             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1025             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1026             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1027             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1028             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1029             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1030             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1031
1032             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1033             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1034             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1035             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1036             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1037             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1038             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1039             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1040             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1041             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1042
1043             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1044             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1045             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1046             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1047             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1048             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1049             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1050             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1051             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1052             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1053
1054             fjx0             = _mm_setzero_ps();
1055             fjy0             = _mm_setzero_ps();
1056             fjz0             = _mm_setzero_ps();
1057             fjx1             = _mm_setzero_ps();
1058             fjy1             = _mm_setzero_ps();
1059             fjz1             = _mm_setzero_ps();
1060             fjx2             = _mm_setzero_ps();
1061             fjy2             = _mm_setzero_ps();
1062             fjz2             = _mm_setzero_ps();
1063             fjx3             = _mm_setzero_ps();
1064             fjy3             = _mm_setzero_ps();
1065             fjz3             = _mm_setzero_ps();
1066
1067             /**************************
1068              * CALCULATE INTERACTIONS *
1069              **************************/
1070
1071             if (gmx_mm_any_lt(rsq00,rcutoff2))
1072             {
1073
1074             r00              = _mm_mul_ps(rsq00,rinv00);
1075             r00              = _mm_andnot_ps(dummy_mask,r00);
1076
1077             /* LENNARD-JONES DISPERSION/REPULSION */
1078
1079             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1080             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1081             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1082             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1083             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1084
1085             d                = _mm_sub_ps(r00,rswitch);
1086             d                = _mm_max_ps(d,_mm_setzero_ps());
1087             d2               = _mm_mul_ps(d,d);
1088             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1089
1090             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1091
1092             /* Evaluate switch function */
1093             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1094             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1095             vvdw             = _mm_mul_ps(vvdw,sw);
1096             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1097
1098             /* Update potential sum for this i atom from the interaction with this j atom. */
1099             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
1100             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
1101             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
1102
1103             fscal            = fvdw;
1104
1105             fscal            = _mm_and_ps(fscal,cutoff_mask);
1106
1107             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1108
1109             /* Calculate temporary vectorial force */
1110             tx               = _mm_mul_ps(fscal,dx00);
1111             ty               = _mm_mul_ps(fscal,dy00);
1112             tz               = _mm_mul_ps(fscal,dz00);
1113
1114             /* Update vectorial force */
1115             fix0             = _mm_add_ps(fix0,tx);
1116             fiy0             = _mm_add_ps(fiy0,ty);
1117             fiz0             = _mm_add_ps(fiz0,tz);
1118
1119             fjx0             = _mm_add_ps(fjx0,tx);
1120             fjy0             = _mm_add_ps(fjy0,ty);
1121             fjz0             = _mm_add_ps(fjz0,tz);
1122
1123             }
1124
1125             /**************************
1126              * CALCULATE INTERACTIONS *
1127              **************************/
1128
1129             if (gmx_mm_any_lt(rsq11,rcutoff2))
1130             {
1131
1132             r11              = _mm_mul_ps(rsq11,rinv11);
1133             r11              = _mm_andnot_ps(dummy_mask,r11);
1134
1135             /* EWALD ELECTROSTATICS */
1136
1137             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1138             ewrt             = _mm_mul_ps(r11,ewtabscale);
1139             ewitab           = _mm_cvttps_epi32(ewrt);
1140             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1141             ewitab           = _mm_slli_epi32(ewitab,2);
1142             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1143             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1144             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1145             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1146             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1147             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1148             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1149             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1150             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1151
1152             d                = _mm_sub_ps(r11,rswitch);
1153             d                = _mm_max_ps(d,_mm_setzero_ps());
1154             d2               = _mm_mul_ps(d,d);
1155             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1156
1157             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1158
1159             /* Evaluate switch function */
1160             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1161             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1162             velec            = _mm_mul_ps(velec,sw);
1163             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1164
1165             /* Update potential sum for this i atom from the interaction with this j atom. */
1166             velec            = _mm_and_ps(velec,cutoff_mask);
1167             velec            = _mm_andnot_ps(dummy_mask,velec);
1168             velecsum         = _mm_add_ps(velecsum,velec);
1169
1170             fscal            = felec;
1171
1172             fscal            = _mm_and_ps(fscal,cutoff_mask);
1173
1174             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1175
1176             /* Calculate temporary vectorial force */
1177             tx               = _mm_mul_ps(fscal,dx11);
1178             ty               = _mm_mul_ps(fscal,dy11);
1179             tz               = _mm_mul_ps(fscal,dz11);
1180
1181             /* Update vectorial force */
1182             fix1             = _mm_add_ps(fix1,tx);
1183             fiy1             = _mm_add_ps(fiy1,ty);
1184             fiz1             = _mm_add_ps(fiz1,tz);
1185
1186             fjx1             = _mm_add_ps(fjx1,tx);
1187             fjy1             = _mm_add_ps(fjy1,ty);
1188             fjz1             = _mm_add_ps(fjz1,tz);
1189
1190             }
1191
1192             /**************************
1193              * CALCULATE INTERACTIONS *
1194              **************************/
1195
1196             if (gmx_mm_any_lt(rsq12,rcutoff2))
1197             {
1198
1199             r12              = _mm_mul_ps(rsq12,rinv12);
1200             r12              = _mm_andnot_ps(dummy_mask,r12);
1201
1202             /* EWALD ELECTROSTATICS */
1203
1204             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1205             ewrt             = _mm_mul_ps(r12,ewtabscale);
1206             ewitab           = _mm_cvttps_epi32(ewrt);
1207             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1208             ewitab           = _mm_slli_epi32(ewitab,2);
1209             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1210             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1211             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1212             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1213             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1214             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1215             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1216             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1217             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1218
1219             d                = _mm_sub_ps(r12,rswitch);
1220             d                = _mm_max_ps(d,_mm_setzero_ps());
1221             d2               = _mm_mul_ps(d,d);
1222             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1223
1224             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1225
1226             /* Evaluate switch function */
1227             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1228             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1229             velec            = _mm_mul_ps(velec,sw);
1230             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1231
1232             /* Update potential sum for this i atom from the interaction with this j atom. */
1233             velec            = _mm_and_ps(velec,cutoff_mask);
1234             velec            = _mm_andnot_ps(dummy_mask,velec);
1235             velecsum         = _mm_add_ps(velecsum,velec);
1236
1237             fscal            = felec;
1238
1239             fscal            = _mm_and_ps(fscal,cutoff_mask);
1240
1241             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1242
1243             /* Calculate temporary vectorial force */
1244             tx               = _mm_mul_ps(fscal,dx12);
1245             ty               = _mm_mul_ps(fscal,dy12);
1246             tz               = _mm_mul_ps(fscal,dz12);
1247
1248             /* Update vectorial force */
1249             fix1             = _mm_add_ps(fix1,tx);
1250             fiy1             = _mm_add_ps(fiy1,ty);
1251             fiz1             = _mm_add_ps(fiz1,tz);
1252
1253             fjx2             = _mm_add_ps(fjx2,tx);
1254             fjy2             = _mm_add_ps(fjy2,ty);
1255             fjz2             = _mm_add_ps(fjz2,tz);
1256
1257             }
1258
1259             /**************************
1260              * CALCULATE INTERACTIONS *
1261              **************************/
1262
1263             if (gmx_mm_any_lt(rsq13,rcutoff2))
1264             {
1265
1266             r13              = _mm_mul_ps(rsq13,rinv13);
1267             r13              = _mm_andnot_ps(dummy_mask,r13);
1268
1269             /* EWALD ELECTROSTATICS */
1270
1271             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1272             ewrt             = _mm_mul_ps(r13,ewtabscale);
1273             ewitab           = _mm_cvttps_epi32(ewrt);
1274             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1275             ewitab           = _mm_slli_epi32(ewitab,2);
1276             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1277             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1278             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1279             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1280             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1281             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1282             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1283             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1284             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1285
1286             d                = _mm_sub_ps(r13,rswitch);
1287             d                = _mm_max_ps(d,_mm_setzero_ps());
1288             d2               = _mm_mul_ps(d,d);
1289             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1290
1291             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1292
1293             /* Evaluate switch function */
1294             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1295             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1296             velec            = _mm_mul_ps(velec,sw);
1297             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1298
1299             /* Update potential sum for this i atom from the interaction with this j atom. */
1300             velec            = _mm_and_ps(velec,cutoff_mask);
1301             velec            = _mm_andnot_ps(dummy_mask,velec);
1302             velecsum         = _mm_add_ps(velecsum,velec);
1303
1304             fscal            = felec;
1305
1306             fscal            = _mm_and_ps(fscal,cutoff_mask);
1307
1308             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1309
1310             /* Calculate temporary vectorial force */
1311             tx               = _mm_mul_ps(fscal,dx13);
1312             ty               = _mm_mul_ps(fscal,dy13);
1313             tz               = _mm_mul_ps(fscal,dz13);
1314
1315             /* Update vectorial force */
1316             fix1             = _mm_add_ps(fix1,tx);
1317             fiy1             = _mm_add_ps(fiy1,ty);
1318             fiz1             = _mm_add_ps(fiz1,tz);
1319
1320             fjx3             = _mm_add_ps(fjx3,tx);
1321             fjy3             = _mm_add_ps(fjy3,ty);
1322             fjz3             = _mm_add_ps(fjz3,tz);
1323
1324             }
1325
1326             /**************************
1327              * CALCULATE INTERACTIONS *
1328              **************************/
1329
1330             if (gmx_mm_any_lt(rsq21,rcutoff2))
1331             {
1332
1333             r21              = _mm_mul_ps(rsq21,rinv21);
1334             r21              = _mm_andnot_ps(dummy_mask,r21);
1335
1336             /* EWALD ELECTROSTATICS */
1337
1338             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1339             ewrt             = _mm_mul_ps(r21,ewtabscale);
1340             ewitab           = _mm_cvttps_epi32(ewrt);
1341             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1342             ewitab           = _mm_slli_epi32(ewitab,2);
1343             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1344             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1345             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1346             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1347             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1348             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1349             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1350             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1351             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1352
1353             d                = _mm_sub_ps(r21,rswitch);
1354             d                = _mm_max_ps(d,_mm_setzero_ps());
1355             d2               = _mm_mul_ps(d,d);
1356             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1357
1358             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1359
1360             /* Evaluate switch function */
1361             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1362             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1363             velec            = _mm_mul_ps(velec,sw);
1364             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1365
1366             /* Update potential sum for this i atom from the interaction with this j atom. */
1367             velec            = _mm_and_ps(velec,cutoff_mask);
1368             velec            = _mm_andnot_ps(dummy_mask,velec);
1369             velecsum         = _mm_add_ps(velecsum,velec);
1370
1371             fscal            = felec;
1372
1373             fscal            = _mm_and_ps(fscal,cutoff_mask);
1374
1375             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1376
1377             /* Calculate temporary vectorial force */
1378             tx               = _mm_mul_ps(fscal,dx21);
1379             ty               = _mm_mul_ps(fscal,dy21);
1380             tz               = _mm_mul_ps(fscal,dz21);
1381
1382             /* Update vectorial force */
1383             fix2             = _mm_add_ps(fix2,tx);
1384             fiy2             = _mm_add_ps(fiy2,ty);
1385             fiz2             = _mm_add_ps(fiz2,tz);
1386
1387             fjx1             = _mm_add_ps(fjx1,tx);
1388             fjy1             = _mm_add_ps(fjy1,ty);
1389             fjz1             = _mm_add_ps(fjz1,tz);
1390
1391             }
1392
1393             /**************************
1394              * CALCULATE INTERACTIONS *
1395              **************************/
1396
1397             if (gmx_mm_any_lt(rsq22,rcutoff2))
1398             {
1399
1400             r22              = _mm_mul_ps(rsq22,rinv22);
1401             r22              = _mm_andnot_ps(dummy_mask,r22);
1402
1403             /* EWALD ELECTROSTATICS */
1404
1405             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1406             ewrt             = _mm_mul_ps(r22,ewtabscale);
1407             ewitab           = _mm_cvttps_epi32(ewrt);
1408             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1409             ewitab           = _mm_slli_epi32(ewitab,2);
1410             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1411             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1412             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1413             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1414             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1415             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1416             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1417             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1418             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1419
1420             d                = _mm_sub_ps(r22,rswitch);
1421             d                = _mm_max_ps(d,_mm_setzero_ps());
1422             d2               = _mm_mul_ps(d,d);
1423             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1424
1425             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1426
1427             /* Evaluate switch function */
1428             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1429             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1430             velec            = _mm_mul_ps(velec,sw);
1431             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1432
1433             /* Update potential sum for this i atom from the interaction with this j atom. */
1434             velec            = _mm_and_ps(velec,cutoff_mask);
1435             velec            = _mm_andnot_ps(dummy_mask,velec);
1436             velecsum         = _mm_add_ps(velecsum,velec);
1437
1438             fscal            = felec;
1439
1440             fscal            = _mm_and_ps(fscal,cutoff_mask);
1441
1442             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1443
1444             /* Calculate temporary vectorial force */
1445             tx               = _mm_mul_ps(fscal,dx22);
1446             ty               = _mm_mul_ps(fscal,dy22);
1447             tz               = _mm_mul_ps(fscal,dz22);
1448
1449             /* Update vectorial force */
1450             fix2             = _mm_add_ps(fix2,tx);
1451             fiy2             = _mm_add_ps(fiy2,ty);
1452             fiz2             = _mm_add_ps(fiz2,tz);
1453
1454             fjx2             = _mm_add_ps(fjx2,tx);
1455             fjy2             = _mm_add_ps(fjy2,ty);
1456             fjz2             = _mm_add_ps(fjz2,tz);
1457
1458             }
1459
1460             /**************************
1461              * CALCULATE INTERACTIONS *
1462              **************************/
1463
1464             if (gmx_mm_any_lt(rsq23,rcutoff2))
1465             {
1466
1467             r23              = _mm_mul_ps(rsq23,rinv23);
1468             r23              = _mm_andnot_ps(dummy_mask,r23);
1469
1470             /* EWALD ELECTROSTATICS */
1471
1472             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1473             ewrt             = _mm_mul_ps(r23,ewtabscale);
1474             ewitab           = _mm_cvttps_epi32(ewrt);
1475             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1476             ewitab           = _mm_slli_epi32(ewitab,2);
1477             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1478             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1479             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1480             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1481             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1482             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1483             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1484             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1485             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1486
1487             d                = _mm_sub_ps(r23,rswitch);
1488             d                = _mm_max_ps(d,_mm_setzero_ps());
1489             d2               = _mm_mul_ps(d,d);
1490             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1491
1492             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1493
1494             /* Evaluate switch function */
1495             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1496             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1497             velec            = _mm_mul_ps(velec,sw);
1498             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1499
1500             /* Update potential sum for this i atom from the interaction with this j atom. */
1501             velec            = _mm_and_ps(velec,cutoff_mask);
1502             velec            = _mm_andnot_ps(dummy_mask,velec);
1503             velecsum         = _mm_add_ps(velecsum,velec);
1504
1505             fscal            = felec;
1506
1507             fscal            = _mm_and_ps(fscal,cutoff_mask);
1508
1509             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1510
1511             /* Calculate temporary vectorial force */
1512             tx               = _mm_mul_ps(fscal,dx23);
1513             ty               = _mm_mul_ps(fscal,dy23);
1514             tz               = _mm_mul_ps(fscal,dz23);
1515
1516             /* Update vectorial force */
1517             fix2             = _mm_add_ps(fix2,tx);
1518             fiy2             = _mm_add_ps(fiy2,ty);
1519             fiz2             = _mm_add_ps(fiz2,tz);
1520
1521             fjx3             = _mm_add_ps(fjx3,tx);
1522             fjy3             = _mm_add_ps(fjy3,ty);
1523             fjz3             = _mm_add_ps(fjz3,tz);
1524
1525             }
1526
1527             /**************************
1528              * CALCULATE INTERACTIONS *
1529              **************************/
1530
1531             if (gmx_mm_any_lt(rsq31,rcutoff2))
1532             {
1533
1534             r31              = _mm_mul_ps(rsq31,rinv31);
1535             r31              = _mm_andnot_ps(dummy_mask,r31);
1536
1537             /* EWALD ELECTROSTATICS */
1538
1539             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1540             ewrt             = _mm_mul_ps(r31,ewtabscale);
1541             ewitab           = _mm_cvttps_epi32(ewrt);
1542             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1543             ewitab           = _mm_slli_epi32(ewitab,2);
1544             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1545             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1546             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1547             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1548             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1549             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1550             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1551             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1552             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1553
1554             d                = _mm_sub_ps(r31,rswitch);
1555             d                = _mm_max_ps(d,_mm_setzero_ps());
1556             d2               = _mm_mul_ps(d,d);
1557             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1558
1559             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1560
1561             /* Evaluate switch function */
1562             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1563             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1564             velec            = _mm_mul_ps(velec,sw);
1565             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1566
1567             /* Update potential sum for this i atom from the interaction with this j atom. */
1568             velec            = _mm_and_ps(velec,cutoff_mask);
1569             velec            = _mm_andnot_ps(dummy_mask,velec);
1570             velecsum         = _mm_add_ps(velecsum,velec);
1571
1572             fscal            = felec;
1573
1574             fscal            = _mm_and_ps(fscal,cutoff_mask);
1575
1576             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1577
1578             /* Calculate temporary vectorial force */
1579             tx               = _mm_mul_ps(fscal,dx31);
1580             ty               = _mm_mul_ps(fscal,dy31);
1581             tz               = _mm_mul_ps(fscal,dz31);
1582
1583             /* Update vectorial force */
1584             fix3             = _mm_add_ps(fix3,tx);
1585             fiy3             = _mm_add_ps(fiy3,ty);
1586             fiz3             = _mm_add_ps(fiz3,tz);
1587
1588             fjx1             = _mm_add_ps(fjx1,tx);
1589             fjy1             = _mm_add_ps(fjy1,ty);
1590             fjz1             = _mm_add_ps(fjz1,tz);
1591
1592             }
1593
1594             /**************************
1595              * CALCULATE INTERACTIONS *
1596              **************************/
1597
1598             if (gmx_mm_any_lt(rsq32,rcutoff2))
1599             {
1600
1601             r32              = _mm_mul_ps(rsq32,rinv32);
1602             r32              = _mm_andnot_ps(dummy_mask,r32);
1603
1604             /* EWALD ELECTROSTATICS */
1605
1606             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1607             ewrt             = _mm_mul_ps(r32,ewtabscale);
1608             ewitab           = _mm_cvttps_epi32(ewrt);
1609             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1610             ewitab           = _mm_slli_epi32(ewitab,2);
1611             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1612             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1613             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1614             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1615             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1616             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1617             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1618             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1619             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1620
1621             d                = _mm_sub_ps(r32,rswitch);
1622             d                = _mm_max_ps(d,_mm_setzero_ps());
1623             d2               = _mm_mul_ps(d,d);
1624             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1625
1626             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1627
1628             /* Evaluate switch function */
1629             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1630             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1631             velec            = _mm_mul_ps(velec,sw);
1632             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1633
1634             /* Update potential sum for this i atom from the interaction with this j atom. */
1635             velec            = _mm_and_ps(velec,cutoff_mask);
1636             velec            = _mm_andnot_ps(dummy_mask,velec);
1637             velecsum         = _mm_add_ps(velecsum,velec);
1638
1639             fscal            = felec;
1640
1641             fscal            = _mm_and_ps(fscal,cutoff_mask);
1642
1643             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1644
1645             /* Calculate temporary vectorial force */
1646             tx               = _mm_mul_ps(fscal,dx32);
1647             ty               = _mm_mul_ps(fscal,dy32);
1648             tz               = _mm_mul_ps(fscal,dz32);
1649
1650             /* Update vectorial force */
1651             fix3             = _mm_add_ps(fix3,tx);
1652             fiy3             = _mm_add_ps(fiy3,ty);
1653             fiz3             = _mm_add_ps(fiz3,tz);
1654
1655             fjx2             = _mm_add_ps(fjx2,tx);
1656             fjy2             = _mm_add_ps(fjy2,ty);
1657             fjz2             = _mm_add_ps(fjz2,tz);
1658
1659             }
1660
1661             /**************************
1662              * CALCULATE INTERACTIONS *
1663              **************************/
1664
1665             if (gmx_mm_any_lt(rsq33,rcutoff2))
1666             {
1667
1668             r33              = _mm_mul_ps(rsq33,rinv33);
1669             r33              = _mm_andnot_ps(dummy_mask,r33);
1670
1671             /* EWALD ELECTROSTATICS */
1672
1673             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1674             ewrt             = _mm_mul_ps(r33,ewtabscale);
1675             ewitab           = _mm_cvttps_epi32(ewrt);
1676             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1677             ewitab           = _mm_slli_epi32(ewitab,2);
1678             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1679             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1680             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1681             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1682             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1683             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1684             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1685             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1686             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1687
1688             d                = _mm_sub_ps(r33,rswitch);
1689             d                = _mm_max_ps(d,_mm_setzero_ps());
1690             d2               = _mm_mul_ps(d,d);
1691             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1692
1693             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1694
1695             /* Evaluate switch function */
1696             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1697             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1698             velec            = _mm_mul_ps(velec,sw);
1699             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1700
1701             /* Update potential sum for this i atom from the interaction with this j atom. */
1702             velec            = _mm_and_ps(velec,cutoff_mask);
1703             velec            = _mm_andnot_ps(dummy_mask,velec);
1704             velecsum         = _mm_add_ps(velecsum,velec);
1705
1706             fscal            = felec;
1707
1708             fscal            = _mm_and_ps(fscal,cutoff_mask);
1709
1710             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1711
1712             /* Calculate temporary vectorial force */
1713             tx               = _mm_mul_ps(fscal,dx33);
1714             ty               = _mm_mul_ps(fscal,dy33);
1715             tz               = _mm_mul_ps(fscal,dz33);
1716
1717             /* Update vectorial force */
1718             fix3             = _mm_add_ps(fix3,tx);
1719             fiy3             = _mm_add_ps(fiy3,ty);
1720             fiz3             = _mm_add_ps(fiz3,tz);
1721
1722             fjx3             = _mm_add_ps(fjx3,tx);
1723             fjy3             = _mm_add_ps(fjy3,ty);
1724             fjz3             = _mm_add_ps(fjz3,tz);
1725
1726             }
1727
1728             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
1729                                                    f+j_coord_offsetC,f+j_coord_offsetD,
1730                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1731                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1732
1733             /* Inner loop uses 657 flops */
1734         }
1735
1736         /* End of innermost loop */
1737
1738         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1739                                               f+i_coord_offset,fshift+i_shift_offset);
1740
1741         ggid                        = gid[iidx];
1742         /* Update potential energies */
1743         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1744         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1745
1746         /* Increment number of inner iterations */
1747         inneriter                  += j_index_end - j_index_start;
1748
1749         /* Outer loop uses 38 flops */
1750     }
1751
1752     /* Increment number of outer iterations */
1753     outeriter        += nri;
1754
1755     /* Update outer/inner flops */
1756
1757     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*38 + inneriter*657);
1758 }
1759 /*
1760  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1761  * Electrostatics interaction: Ewald
1762  * VdW interaction:            LennardJones
1763  * Geometry:                   Water4-Water4
1764  * Calculate force/pot:        Force
1765  */
1766 void
1767 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1768                     (t_nblist * gmx_restrict                nlist,
1769                      rvec * gmx_restrict                    xx,
1770                      rvec * gmx_restrict                    ff,
1771                      t_forcerec * gmx_restrict              fr,
1772                      t_mdatoms * gmx_restrict               mdatoms,
1773                      nb_kernel_data_t * gmx_restrict        kernel_data,
1774                      t_nrnb * gmx_restrict                  nrnb)
1775 {
1776     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1777      * just 0 for non-waters.
1778      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1779      * jnr indices corresponding to data put in the four positions in the SIMD register.
1780      */
1781     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1782     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1783     int              jnrA,jnrB,jnrC,jnrD;
1784     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1785     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1786     real             shX,shY,shZ,rcutoff_scalar;
1787     real             *shiftvec,*fshift,*x,*f;
1788     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1789     int              vdwioffset0;
1790     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1791     int              vdwioffset1;
1792     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1793     int              vdwioffset2;
1794     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1795     int              vdwioffset3;
1796     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1797     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1798     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1799     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1800     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1801     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1802     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1803     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1804     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1805     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1806     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1807     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1808     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1809     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1810     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1811     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1812     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1813     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1814     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1815     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1816     real             *charge;
1817     int              nvdwtype;
1818     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1819     int              *vdwtype;
1820     real             *vdwparam;
1821     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1822     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1823     __m128i          ewitab;
1824     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1825     real             *ewtab;
1826     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1827     real             rswitch_scalar,d_scalar;
1828     __m128           dummy_mask,cutoff_mask;
1829     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1830     __m128           one     = _mm_set1_ps(1.0);
1831     __m128           two     = _mm_set1_ps(2.0);
1832     x                = xx[0];
1833     f                = ff[0];
1834
1835     nri              = nlist->nri;
1836     iinr             = nlist->iinr;
1837     jindex           = nlist->jindex;
1838     jjnr             = nlist->jjnr;
1839     shiftidx         = nlist->shift;
1840     gid              = nlist->gid;
1841     shiftvec         = fr->shift_vec[0];
1842     fshift           = fr->fshift[0];
1843     facel            = _mm_set1_ps(fr->epsfac);
1844     charge           = mdatoms->chargeA;
1845     nvdwtype         = fr->ntype;
1846     vdwparam         = fr->nbfp;
1847     vdwtype          = mdatoms->typeA;
1848
1849     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1850     ewtab            = fr->ic->tabq_coul_FDV0;
1851     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1852     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1853
1854     /* Setup water-specific parameters */
1855     inr              = nlist->iinr[0];
1856     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1857     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1858     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1859     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1860
1861     jq1              = _mm_set1_ps(charge[inr+1]);
1862     jq2              = _mm_set1_ps(charge[inr+2]);
1863     jq3              = _mm_set1_ps(charge[inr+3]);
1864     vdwjidx0A        = 2*vdwtype[inr+0];
1865     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1866     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1867     qq11             = _mm_mul_ps(iq1,jq1);
1868     qq12             = _mm_mul_ps(iq1,jq2);
1869     qq13             = _mm_mul_ps(iq1,jq3);
1870     qq21             = _mm_mul_ps(iq2,jq1);
1871     qq22             = _mm_mul_ps(iq2,jq2);
1872     qq23             = _mm_mul_ps(iq2,jq3);
1873     qq31             = _mm_mul_ps(iq3,jq1);
1874     qq32             = _mm_mul_ps(iq3,jq2);
1875     qq33             = _mm_mul_ps(iq3,jq3);
1876
1877     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1878     rcutoff_scalar   = fr->rcoulomb;
1879     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1880     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1881
1882     rswitch_scalar   = fr->rcoulomb_switch;
1883     rswitch          = _mm_set1_ps(rswitch_scalar);
1884     /* Setup switch parameters */
1885     d_scalar         = rcutoff_scalar-rswitch_scalar;
1886     d                = _mm_set1_ps(d_scalar);
1887     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1888     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1889     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1890     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1891     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1892     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1893
1894     /* Avoid stupid compiler warnings */
1895     jnrA = jnrB = jnrC = jnrD = 0;
1896     j_coord_offsetA = 0;
1897     j_coord_offsetB = 0;
1898     j_coord_offsetC = 0;
1899     j_coord_offsetD = 0;
1900
1901     outeriter        = 0;
1902     inneriter        = 0;
1903
1904     /* Start outer loop over neighborlists */
1905     for(iidx=0; iidx<nri; iidx++)
1906     {
1907         /* Load shift vector for this list */
1908         i_shift_offset   = DIM*shiftidx[iidx];
1909         shX              = shiftvec[i_shift_offset+XX];
1910         shY              = shiftvec[i_shift_offset+YY];
1911         shZ              = shiftvec[i_shift_offset+ZZ];
1912
1913         /* Load limits for loop over neighbors */
1914         j_index_start    = jindex[iidx];
1915         j_index_end      = jindex[iidx+1];
1916
1917         /* Get outer coordinate index */
1918         inr              = iinr[iidx];
1919         i_coord_offset   = DIM*inr;
1920
1921         /* Load i particle coords and add shift vector */
1922         ix0              = _mm_set1_ps(shX + x[i_coord_offset+DIM*0+XX]);
1923         iy0              = _mm_set1_ps(shY + x[i_coord_offset+DIM*0+YY]);
1924         iz0              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*0+ZZ]);
1925         ix1              = _mm_set1_ps(shX + x[i_coord_offset+DIM*1+XX]);
1926         iy1              = _mm_set1_ps(shY + x[i_coord_offset+DIM*1+YY]);
1927         iz1              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*1+ZZ]);
1928         ix2              = _mm_set1_ps(shX + x[i_coord_offset+DIM*2+XX]);
1929         iy2              = _mm_set1_ps(shY + x[i_coord_offset+DIM*2+YY]);
1930         iz2              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*2+ZZ]);
1931         ix3              = _mm_set1_ps(shX + x[i_coord_offset+DIM*3+XX]);
1932         iy3              = _mm_set1_ps(shY + x[i_coord_offset+DIM*3+YY]);
1933         iz3              = _mm_set1_ps(shZ + x[i_coord_offset+DIM*3+ZZ]);
1934
1935         fix0             = _mm_setzero_ps();
1936         fiy0             = _mm_setzero_ps();
1937         fiz0             = _mm_setzero_ps();
1938         fix1             = _mm_setzero_ps();
1939         fiy1             = _mm_setzero_ps();
1940         fiz1             = _mm_setzero_ps();
1941         fix2             = _mm_setzero_ps();
1942         fiy2             = _mm_setzero_ps();
1943         fiz2             = _mm_setzero_ps();
1944         fix3             = _mm_setzero_ps();
1945         fiy3             = _mm_setzero_ps();
1946         fiz3             = _mm_setzero_ps();
1947
1948         /* Start inner kernel loop */
1949         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1950         {
1951
1952             /* Get j neighbor index, and coordinate index */
1953             jnrA             = jjnr[jidx];
1954             jnrB             = jjnr[jidx+1];
1955             jnrC             = jjnr[jidx+2];
1956             jnrD             = jjnr[jidx+3];
1957
1958             j_coord_offsetA  = DIM*jnrA;
1959             j_coord_offsetB  = DIM*jnrB;
1960             j_coord_offsetC  = DIM*jnrC;
1961             j_coord_offsetD  = DIM*jnrD;
1962
1963             /* load j atom coordinates */
1964             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1965                                               x+j_coord_offsetC,x+j_coord_offsetD,
1966                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1967                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1968
1969             /* Calculate displacement vector */
1970             dx00             = _mm_sub_ps(ix0,jx0);
1971             dy00             = _mm_sub_ps(iy0,jy0);
1972             dz00             = _mm_sub_ps(iz0,jz0);
1973             dx11             = _mm_sub_ps(ix1,jx1);
1974             dy11             = _mm_sub_ps(iy1,jy1);
1975             dz11             = _mm_sub_ps(iz1,jz1);
1976             dx12             = _mm_sub_ps(ix1,jx2);
1977             dy12             = _mm_sub_ps(iy1,jy2);
1978             dz12             = _mm_sub_ps(iz1,jz2);
1979             dx13             = _mm_sub_ps(ix1,jx3);
1980             dy13             = _mm_sub_ps(iy1,jy3);
1981             dz13             = _mm_sub_ps(iz1,jz3);
1982             dx21             = _mm_sub_ps(ix2,jx1);
1983             dy21             = _mm_sub_ps(iy2,jy1);
1984             dz21             = _mm_sub_ps(iz2,jz1);
1985             dx22             = _mm_sub_ps(ix2,jx2);
1986             dy22             = _mm_sub_ps(iy2,jy2);
1987             dz22             = _mm_sub_ps(iz2,jz2);
1988             dx23             = _mm_sub_ps(ix2,jx3);
1989             dy23             = _mm_sub_ps(iy2,jy3);
1990             dz23             = _mm_sub_ps(iz2,jz3);
1991             dx31             = _mm_sub_ps(ix3,jx1);
1992             dy31             = _mm_sub_ps(iy3,jy1);
1993             dz31             = _mm_sub_ps(iz3,jz1);
1994             dx32             = _mm_sub_ps(ix3,jx2);
1995             dy32             = _mm_sub_ps(iy3,jy2);
1996             dz32             = _mm_sub_ps(iz3,jz2);
1997             dx33             = _mm_sub_ps(ix3,jx3);
1998             dy33             = _mm_sub_ps(iy3,jy3);
1999             dz33             = _mm_sub_ps(iz3,jz3);
2000
2001             /* Calculate squared distance and things based on it */
2002             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2003             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2004             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2005             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2006             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2007             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2008             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2009             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2010             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2011             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2012
2013             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2014             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2015             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2016             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2017             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2018             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2019             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2020             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2021             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2022             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2023
2024             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2025             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2026             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2027             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2028             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2029             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2030             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2031             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2032             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2033             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2034
2035             fjx0             = _mm_setzero_ps();
2036             fjy0             = _mm_setzero_ps();
2037             fjz0             = _mm_setzero_ps();
2038             fjx1             = _mm_setzero_ps();
2039             fjy1             = _mm_setzero_ps();
2040             fjz1             = _mm_setzero_ps();
2041             fjx2             = _mm_setzero_ps();
2042             fjy2             = _mm_setzero_ps();
2043             fjz2             = _mm_setzero_ps();
2044             fjx3             = _mm_setzero_ps();
2045             fjy3             = _mm_setzero_ps();
2046             fjz3             = _mm_setzero_ps();
2047
2048             /**************************
2049              * CALCULATE INTERACTIONS *
2050              **************************/
2051
2052             if (gmx_mm_any_lt(rsq00,rcutoff2))
2053             {
2054
2055             r00              = _mm_mul_ps(rsq00,rinv00);
2056
2057             /* LENNARD-JONES DISPERSION/REPULSION */
2058
2059             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2060             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2061             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2062             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2063             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2064
2065             d                = _mm_sub_ps(r00,rswitch);
2066             d                = _mm_max_ps(d,_mm_setzero_ps());
2067             d2               = _mm_mul_ps(d,d);
2068             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2069
2070             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2071
2072             /* Evaluate switch function */
2073             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2074             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2075             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2076
2077             fscal            = fvdw;
2078
2079             fscal            = _mm_and_ps(fscal,cutoff_mask);
2080
2081             /* Calculate temporary vectorial force */
2082             tx               = _mm_mul_ps(fscal,dx00);
2083             ty               = _mm_mul_ps(fscal,dy00);
2084             tz               = _mm_mul_ps(fscal,dz00);
2085
2086             /* Update vectorial force */
2087             fix0             = _mm_add_ps(fix0,tx);
2088             fiy0             = _mm_add_ps(fiy0,ty);
2089             fiz0             = _mm_add_ps(fiz0,tz);
2090
2091             fjx0             = _mm_add_ps(fjx0,tx);
2092             fjy0             = _mm_add_ps(fjy0,ty);
2093             fjz0             = _mm_add_ps(fjz0,tz);
2094
2095             }
2096
2097             /**************************
2098              * CALCULATE INTERACTIONS *
2099              **************************/
2100
2101             if (gmx_mm_any_lt(rsq11,rcutoff2))
2102             {
2103
2104             r11              = _mm_mul_ps(rsq11,rinv11);
2105
2106             /* EWALD ELECTROSTATICS */
2107
2108             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2109             ewrt             = _mm_mul_ps(r11,ewtabscale);
2110             ewitab           = _mm_cvttps_epi32(ewrt);
2111             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2112             ewitab           = _mm_slli_epi32(ewitab,2);
2113             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2114             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2115             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2116             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2117             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2118             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2119             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2120             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2121             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2122
2123             d                = _mm_sub_ps(r11,rswitch);
2124             d                = _mm_max_ps(d,_mm_setzero_ps());
2125             d2               = _mm_mul_ps(d,d);
2126             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2127
2128             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2129
2130             /* Evaluate switch function */
2131             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2132             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2133             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2134
2135             fscal            = felec;
2136
2137             fscal            = _mm_and_ps(fscal,cutoff_mask);
2138
2139             /* Calculate temporary vectorial force */
2140             tx               = _mm_mul_ps(fscal,dx11);
2141             ty               = _mm_mul_ps(fscal,dy11);
2142             tz               = _mm_mul_ps(fscal,dz11);
2143
2144             /* Update vectorial force */
2145             fix1             = _mm_add_ps(fix1,tx);
2146             fiy1             = _mm_add_ps(fiy1,ty);
2147             fiz1             = _mm_add_ps(fiz1,tz);
2148
2149             fjx1             = _mm_add_ps(fjx1,tx);
2150             fjy1             = _mm_add_ps(fjy1,ty);
2151             fjz1             = _mm_add_ps(fjz1,tz);
2152
2153             }
2154
2155             /**************************
2156              * CALCULATE INTERACTIONS *
2157              **************************/
2158
2159             if (gmx_mm_any_lt(rsq12,rcutoff2))
2160             {
2161
2162             r12              = _mm_mul_ps(rsq12,rinv12);
2163
2164             /* EWALD ELECTROSTATICS */
2165
2166             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2167             ewrt             = _mm_mul_ps(r12,ewtabscale);
2168             ewitab           = _mm_cvttps_epi32(ewrt);
2169             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2170             ewitab           = _mm_slli_epi32(ewitab,2);
2171             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2172             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2173             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2174             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2175             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2176             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2177             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2178             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2179             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2180
2181             d                = _mm_sub_ps(r12,rswitch);
2182             d                = _mm_max_ps(d,_mm_setzero_ps());
2183             d2               = _mm_mul_ps(d,d);
2184             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2185
2186             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2187
2188             /* Evaluate switch function */
2189             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2190             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2191             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2192
2193             fscal            = felec;
2194
2195             fscal            = _mm_and_ps(fscal,cutoff_mask);
2196
2197             /* Calculate temporary vectorial force */
2198             tx               = _mm_mul_ps(fscal,dx12);
2199             ty               = _mm_mul_ps(fscal,dy12);
2200             tz               = _mm_mul_ps(fscal,dz12);
2201
2202             /* Update vectorial force */
2203             fix1             = _mm_add_ps(fix1,tx);
2204             fiy1             = _mm_add_ps(fiy1,ty);
2205             fiz1             = _mm_add_ps(fiz1,tz);
2206
2207             fjx2             = _mm_add_ps(fjx2,tx);
2208             fjy2             = _mm_add_ps(fjy2,ty);
2209             fjz2             = _mm_add_ps(fjz2,tz);
2210
2211             }
2212
2213             /**************************
2214              * CALCULATE INTERACTIONS *
2215              **************************/
2216
2217             if (gmx_mm_any_lt(rsq13,rcutoff2))
2218             {
2219
2220             r13              = _mm_mul_ps(rsq13,rinv13);
2221
2222             /* EWALD ELECTROSTATICS */
2223
2224             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2225             ewrt             = _mm_mul_ps(r13,ewtabscale);
2226             ewitab           = _mm_cvttps_epi32(ewrt);
2227             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2228             ewitab           = _mm_slli_epi32(ewitab,2);
2229             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2230             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2231             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2232             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2233             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2234             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2235             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2236             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2237             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2238
2239             d                = _mm_sub_ps(r13,rswitch);
2240             d                = _mm_max_ps(d,_mm_setzero_ps());
2241             d2               = _mm_mul_ps(d,d);
2242             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2243
2244             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2245
2246             /* Evaluate switch function */
2247             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2248             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2249             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2250
2251             fscal            = felec;
2252
2253             fscal            = _mm_and_ps(fscal,cutoff_mask);
2254
2255             /* Calculate temporary vectorial force */
2256             tx               = _mm_mul_ps(fscal,dx13);
2257             ty               = _mm_mul_ps(fscal,dy13);
2258             tz               = _mm_mul_ps(fscal,dz13);
2259
2260             /* Update vectorial force */
2261             fix1             = _mm_add_ps(fix1,tx);
2262             fiy1             = _mm_add_ps(fiy1,ty);
2263             fiz1             = _mm_add_ps(fiz1,tz);
2264
2265             fjx3             = _mm_add_ps(fjx3,tx);
2266             fjy3             = _mm_add_ps(fjy3,ty);
2267             fjz3             = _mm_add_ps(fjz3,tz);
2268
2269             }
2270
2271             /**************************
2272              * CALCULATE INTERACTIONS *
2273              **************************/
2274
2275             if (gmx_mm_any_lt(rsq21,rcutoff2))
2276             {
2277
2278             r21              = _mm_mul_ps(rsq21,rinv21);
2279
2280             /* EWALD ELECTROSTATICS */
2281
2282             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2283             ewrt             = _mm_mul_ps(r21,ewtabscale);
2284             ewitab           = _mm_cvttps_epi32(ewrt);
2285             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2286             ewitab           = _mm_slli_epi32(ewitab,2);
2287             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2288             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2289             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2290             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2291             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2292             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2293             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2294             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2295             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2296
2297             d                = _mm_sub_ps(r21,rswitch);
2298             d                = _mm_max_ps(d,_mm_setzero_ps());
2299             d2               = _mm_mul_ps(d,d);
2300             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2301
2302             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2303
2304             /* Evaluate switch function */
2305             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2306             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2307             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2308
2309             fscal            = felec;
2310
2311             fscal            = _mm_and_ps(fscal,cutoff_mask);
2312
2313             /* Calculate temporary vectorial force */
2314             tx               = _mm_mul_ps(fscal,dx21);
2315             ty               = _mm_mul_ps(fscal,dy21);
2316             tz               = _mm_mul_ps(fscal,dz21);
2317
2318             /* Update vectorial force */
2319             fix2             = _mm_add_ps(fix2,tx);
2320             fiy2             = _mm_add_ps(fiy2,ty);
2321             fiz2             = _mm_add_ps(fiz2,tz);
2322
2323             fjx1             = _mm_add_ps(fjx1,tx);
2324             fjy1             = _mm_add_ps(fjy1,ty);
2325             fjz1             = _mm_add_ps(fjz1,tz);
2326
2327             }
2328
2329             /**************************
2330              * CALCULATE INTERACTIONS *
2331              **************************/
2332
2333             if (gmx_mm_any_lt(rsq22,rcutoff2))
2334             {
2335
2336             r22              = _mm_mul_ps(rsq22,rinv22);
2337
2338             /* EWALD ELECTROSTATICS */
2339
2340             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2341             ewrt             = _mm_mul_ps(r22,ewtabscale);
2342             ewitab           = _mm_cvttps_epi32(ewrt);
2343             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2344             ewitab           = _mm_slli_epi32(ewitab,2);
2345             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2346             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2347             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2348             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2349             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2350             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2351             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2352             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2353             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2354
2355             d                = _mm_sub_ps(r22,rswitch);
2356             d                = _mm_max_ps(d,_mm_setzero_ps());
2357             d2               = _mm_mul_ps(d,d);
2358             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2359
2360             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2361
2362             /* Evaluate switch function */
2363             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2364             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2365             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2366
2367             fscal            = felec;
2368
2369             fscal            = _mm_and_ps(fscal,cutoff_mask);
2370
2371             /* Calculate temporary vectorial force */
2372             tx               = _mm_mul_ps(fscal,dx22);
2373             ty               = _mm_mul_ps(fscal,dy22);
2374             tz               = _mm_mul_ps(fscal,dz22);
2375
2376             /* Update vectorial force */
2377             fix2             = _mm_add_ps(fix2,tx);
2378             fiy2             = _mm_add_ps(fiy2,ty);
2379             fiz2             = _mm_add_ps(fiz2,tz);
2380
2381             fjx2             = _mm_add_ps(fjx2,tx);
2382             fjy2             = _mm_add_ps(fjy2,ty);
2383             fjz2             = _mm_add_ps(fjz2,tz);
2384
2385             }
2386
2387             /**************************
2388              * CALCULATE INTERACTIONS *
2389              **************************/
2390
2391             if (gmx_mm_any_lt(rsq23,rcutoff2))
2392             {
2393
2394             r23              = _mm_mul_ps(rsq23,rinv23);
2395
2396             /* EWALD ELECTROSTATICS */
2397
2398             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2399             ewrt             = _mm_mul_ps(r23,ewtabscale);
2400             ewitab           = _mm_cvttps_epi32(ewrt);
2401             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2402             ewitab           = _mm_slli_epi32(ewitab,2);
2403             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2404             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2405             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2406             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2407             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2408             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2409             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2410             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2411             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2412
2413             d                = _mm_sub_ps(r23,rswitch);
2414             d                = _mm_max_ps(d,_mm_setzero_ps());
2415             d2               = _mm_mul_ps(d,d);
2416             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2417
2418             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2419
2420             /* Evaluate switch function */
2421             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2422             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2423             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2424
2425             fscal            = felec;
2426
2427             fscal            = _mm_and_ps(fscal,cutoff_mask);
2428
2429             /* Calculate temporary vectorial force */
2430             tx               = _mm_mul_ps(fscal,dx23);
2431             ty               = _mm_mul_ps(fscal,dy23);
2432             tz               = _mm_mul_ps(fscal,dz23);
2433
2434             /* Update vectorial force */
2435             fix2             = _mm_add_ps(fix2,tx);
2436             fiy2             = _mm_add_ps(fiy2,ty);
2437             fiz2             = _mm_add_ps(fiz2,tz);
2438
2439             fjx3             = _mm_add_ps(fjx3,tx);
2440             fjy3             = _mm_add_ps(fjy3,ty);
2441             fjz3             = _mm_add_ps(fjz3,tz);
2442
2443             }
2444
2445             /**************************
2446              * CALCULATE INTERACTIONS *
2447              **************************/
2448
2449             if (gmx_mm_any_lt(rsq31,rcutoff2))
2450             {
2451
2452             r31              = _mm_mul_ps(rsq31,rinv31);
2453
2454             /* EWALD ELECTROSTATICS */
2455
2456             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2457             ewrt             = _mm_mul_ps(r31,ewtabscale);
2458             ewitab           = _mm_cvttps_epi32(ewrt);
2459             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2460             ewitab           = _mm_slli_epi32(ewitab,2);
2461             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2462             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2463             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2464             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2465             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2466             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2467             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2468             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2469             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2470
2471             d                = _mm_sub_ps(r31,rswitch);
2472             d                = _mm_max_ps(d,_mm_setzero_ps());
2473             d2               = _mm_mul_ps(d,d);
2474             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2475
2476             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2477
2478             /* Evaluate switch function */
2479             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2480             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2481             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2482
2483             fscal            = felec;
2484
2485             fscal            = _mm_and_ps(fscal,cutoff_mask);
2486
2487             /* Calculate temporary vectorial force */
2488             tx               = _mm_mul_ps(fscal,dx31);
2489             ty               = _mm_mul_ps(fscal,dy31);
2490             tz               = _mm_mul_ps(fscal,dz31);
2491
2492             /* Update vectorial force */
2493             fix3             = _mm_add_ps(fix3,tx);
2494             fiy3             = _mm_add_ps(fiy3,ty);
2495             fiz3             = _mm_add_ps(fiz3,tz);
2496
2497             fjx1             = _mm_add_ps(fjx1,tx);
2498             fjy1             = _mm_add_ps(fjy1,ty);
2499             fjz1             = _mm_add_ps(fjz1,tz);
2500
2501             }
2502
2503             /**************************
2504              * CALCULATE INTERACTIONS *
2505              **************************/
2506
2507             if (gmx_mm_any_lt(rsq32,rcutoff2))
2508             {
2509
2510             r32              = _mm_mul_ps(rsq32,rinv32);
2511
2512             /* EWALD ELECTROSTATICS */
2513
2514             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2515             ewrt             = _mm_mul_ps(r32,ewtabscale);
2516             ewitab           = _mm_cvttps_epi32(ewrt);
2517             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2518             ewitab           = _mm_slli_epi32(ewitab,2);
2519             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2520             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2521             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2522             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2523             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2524             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2525             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2526             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2527             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2528
2529             d                = _mm_sub_ps(r32,rswitch);
2530             d                = _mm_max_ps(d,_mm_setzero_ps());
2531             d2               = _mm_mul_ps(d,d);
2532             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2533
2534             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2535
2536             /* Evaluate switch function */
2537             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2538             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2539             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2540
2541             fscal            = felec;
2542
2543             fscal            = _mm_and_ps(fscal,cutoff_mask);
2544
2545             /* Calculate temporary vectorial force */
2546             tx               = _mm_mul_ps(fscal,dx32);
2547             ty               = _mm_mul_ps(fscal,dy32);
2548             tz               = _mm_mul_ps(fscal,dz32);
2549
2550             /* Update vectorial force */
2551             fix3             = _mm_add_ps(fix3,tx);
2552             fiy3             = _mm_add_ps(fiy3,ty);
2553             fiz3             = _mm_add_ps(fiz3,tz);
2554
2555             fjx2             = _mm_add_ps(fjx2,tx);
2556             fjy2             = _mm_add_ps(fjy2,ty);
2557             fjz2             = _mm_add_ps(fjz2,tz);
2558
2559             }
2560
2561             /**************************
2562              * CALCULATE INTERACTIONS *
2563              **************************/
2564
2565             if (gmx_mm_any_lt(rsq33,rcutoff2))
2566             {
2567
2568             r33              = _mm_mul_ps(rsq33,rinv33);
2569
2570             /* EWALD ELECTROSTATICS */
2571
2572             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2573             ewrt             = _mm_mul_ps(r33,ewtabscale);
2574             ewitab           = _mm_cvttps_epi32(ewrt);
2575             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2576             ewitab           = _mm_slli_epi32(ewitab,2);
2577             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2578             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2579             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2580             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2581             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2582             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2583             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2584             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2585             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2586
2587             d                = _mm_sub_ps(r33,rswitch);
2588             d                = _mm_max_ps(d,_mm_setzero_ps());
2589             d2               = _mm_mul_ps(d,d);
2590             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2591
2592             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2593
2594             /* Evaluate switch function */
2595             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2596             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2597             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2598
2599             fscal            = felec;
2600
2601             fscal            = _mm_and_ps(fscal,cutoff_mask);
2602
2603             /* Calculate temporary vectorial force */
2604             tx               = _mm_mul_ps(fscal,dx33);
2605             ty               = _mm_mul_ps(fscal,dy33);
2606             tz               = _mm_mul_ps(fscal,dz33);
2607
2608             /* Update vectorial force */
2609             fix3             = _mm_add_ps(fix3,tx);
2610             fiy3             = _mm_add_ps(fiy3,ty);
2611             fiz3             = _mm_add_ps(fiz3,tz);
2612
2613             fjx3             = _mm_add_ps(fjx3,tx);
2614             fjy3             = _mm_add_ps(fjy3,ty);
2615             fjz3             = _mm_add_ps(fjz3,tz);
2616
2617             }
2618
2619             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
2620                                                    f+j_coord_offsetC,f+j_coord_offsetD,
2621                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2622                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2623
2624             /* Inner loop uses 617 flops */
2625         }
2626
2627         if(jidx<j_index_end)
2628         {
2629
2630             /* Get j neighbor index, and coordinate index */
2631             jnrA             = jjnr[jidx];
2632             jnrB             = jjnr[jidx+1];
2633             jnrC             = jjnr[jidx+2];
2634             jnrD             = jjnr[jidx+3];
2635
2636             /* Sign of each element will be negative for non-real atoms.
2637              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2638              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2639              */
2640             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2641             jnrA       = (jnrA>=0) ? jnrA : 0;
2642             jnrB       = (jnrB>=0) ? jnrB : 0;
2643             jnrC       = (jnrC>=0) ? jnrC : 0;
2644             jnrD       = (jnrD>=0) ? jnrD : 0;
2645
2646             j_coord_offsetA  = DIM*jnrA;
2647             j_coord_offsetB  = DIM*jnrB;
2648             j_coord_offsetC  = DIM*jnrC;
2649             j_coord_offsetD  = DIM*jnrD;
2650
2651             /* load j atom coordinates */
2652             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2653                                               x+j_coord_offsetC,x+j_coord_offsetD,
2654                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2655                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2656
2657             /* Calculate displacement vector */
2658             dx00             = _mm_sub_ps(ix0,jx0);
2659             dy00             = _mm_sub_ps(iy0,jy0);
2660             dz00             = _mm_sub_ps(iz0,jz0);
2661             dx11             = _mm_sub_ps(ix1,jx1);
2662             dy11             = _mm_sub_ps(iy1,jy1);
2663             dz11             = _mm_sub_ps(iz1,jz1);
2664             dx12             = _mm_sub_ps(ix1,jx2);
2665             dy12             = _mm_sub_ps(iy1,jy2);
2666             dz12             = _mm_sub_ps(iz1,jz2);
2667             dx13             = _mm_sub_ps(ix1,jx3);
2668             dy13             = _mm_sub_ps(iy1,jy3);
2669             dz13             = _mm_sub_ps(iz1,jz3);
2670             dx21             = _mm_sub_ps(ix2,jx1);
2671             dy21             = _mm_sub_ps(iy2,jy1);
2672             dz21             = _mm_sub_ps(iz2,jz1);
2673             dx22             = _mm_sub_ps(ix2,jx2);
2674             dy22             = _mm_sub_ps(iy2,jy2);
2675             dz22             = _mm_sub_ps(iz2,jz2);
2676             dx23             = _mm_sub_ps(ix2,jx3);
2677             dy23             = _mm_sub_ps(iy2,jy3);
2678             dz23             = _mm_sub_ps(iz2,jz3);
2679             dx31             = _mm_sub_ps(ix3,jx1);
2680             dy31             = _mm_sub_ps(iy3,jy1);
2681             dz31             = _mm_sub_ps(iz3,jz1);
2682             dx32             = _mm_sub_ps(ix3,jx2);
2683             dy32             = _mm_sub_ps(iy3,jy2);
2684             dz32             = _mm_sub_ps(iz3,jz2);
2685             dx33             = _mm_sub_ps(ix3,jx3);
2686             dy33             = _mm_sub_ps(iy3,jy3);
2687             dz33             = _mm_sub_ps(iz3,jz3);
2688
2689             /* Calculate squared distance and things based on it */
2690             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2691             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2692             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2693             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2694             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2695             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2696             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2697             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2698             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2699             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2700
2701             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2702             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2703             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2704             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2705             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2706             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2707             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2708             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2709             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2710             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2711
2712             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2713             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2714             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2715             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2716             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2717             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2718             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2719             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2720             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2721             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2722
2723             fjx0             = _mm_setzero_ps();
2724             fjy0             = _mm_setzero_ps();
2725             fjz0             = _mm_setzero_ps();
2726             fjx1             = _mm_setzero_ps();
2727             fjy1             = _mm_setzero_ps();
2728             fjz1             = _mm_setzero_ps();
2729             fjx2             = _mm_setzero_ps();
2730             fjy2             = _mm_setzero_ps();
2731             fjz2             = _mm_setzero_ps();
2732             fjx3             = _mm_setzero_ps();
2733             fjy3             = _mm_setzero_ps();
2734             fjz3             = _mm_setzero_ps();
2735
2736             /**************************
2737              * CALCULATE INTERACTIONS *
2738              **************************/
2739
2740             if (gmx_mm_any_lt(rsq00,rcutoff2))
2741             {
2742
2743             r00              = _mm_mul_ps(rsq00,rinv00);
2744             r00              = _mm_andnot_ps(dummy_mask,r00);
2745
2746             /* LENNARD-JONES DISPERSION/REPULSION */
2747
2748             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2749             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2750             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2751             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2752             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2753
2754             d                = _mm_sub_ps(r00,rswitch);
2755             d                = _mm_max_ps(d,_mm_setzero_ps());
2756             d2               = _mm_mul_ps(d,d);
2757             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2758
2759             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2760
2761             /* Evaluate switch function */
2762             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2763             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2764             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2765
2766             fscal            = fvdw;
2767
2768             fscal            = _mm_and_ps(fscal,cutoff_mask);
2769
2770             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2771
2772             /* Calculate temporary vectorial force */
2773             tx               = _mm_mul_ps(fscal,dx00);
2774             ty               = _mm_mul_ps(fscal,dy00);
2775             tz               = _mm_mul_ps(fscal,dz00);
2776
2777             /* Update vectorial force */
2778             fix0             = _mm_add_ps(fix0,tx);
2779             fiy0             = _mm_add_ps(fiy0,ty);
2780             fiz0             = _mm_add_ps(fiz0,tz);
2781
2782             fjx0             = _mm_add_ps(fjx0,tx);
2783             fjy0             = _mm_add_ps(fjy0,ty);
2784             fjz0             = _mm_add_ps(fjz0,tz);
2785
2786             }
2787
2788             /**************************
2789              * CALCULATE INTERACTIONS *
2790              **************************/
2791
2792             if (gmx_mm_any_lt(rsq11,rcutoff2))
2793             {
2794
2795             r11              = _mm_mul_ps(rsq11,rinv11);
2796             r11              = _mm_andnot_ps(dummy_mask,r11);
2797
2798             /* EWALD ELECTROSTATICS */
2799
2800             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2801             ewrt             = _mm_mul_ps(r11,ewtabscale);
2802             ewitab           = _mm_cvttps_epi32(ewrt);
2803             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2804             ewitab           = _mm_slli_epi32(ewitab,2);
2805             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2806             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2807             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2808             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2809             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2810             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2811             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2812             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2813             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2814
2815             d                = _mm_sub_ps(r11,rswitch);
2816             d                = _mm_max_ps(d,_mm_setzero_ps());
2817             d2               = _mm_mul_ps(d,d);
2818             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2819
2820             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2821
2822             /* Evaluate switch function */
2823             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2824             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2825             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2826
2827             fscal            = felec;
2828
2829             fscal            = _mm_and_ps(fscal,cutoff_mask);
2830
2831             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2832
2833             /* Calculate temporary vectorial force */
2834             tx               = _mm_mul_ps(fscal,dx11);
2835             ty               = _mm_mul_ps(fscal,dy11);
2836             tz               = _mm_mul_ps(fscal,dz11);
2837
2838             /* Update vectorial force */
2839             fix1             = _mm_add_ps(fix1,tx);
2840             fiy1             = _mm_add_ps(fiy1,ty);
2841             fiz1             = _mm_add_ps(fiz1,tz);
2842
2843             fjx1             = _mm_add_ps(fjx1,tx);
2844             fjy1             = _mm_add_ps(fjy1,ty);
2845             fjz1             = _mm_add_ps(fjz1,tz);
2846
2847             }
2848
2849             /**************************
2850              * CALCULATE INTERACTIONS *
2851              **************************/
2852
2853             if (gmx_mm_any_lt(rsq12,rcutoff2))
2854             {
2855
2856             r12              = _mm_mul_ps(rsq12,rinv12);
2857             r12              = _mm_andnot_ps(dummy_mask,r12);
2858
2859             /* EWALD ELECTROSTATICS */
2860
2861             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2862             ewrt             = _mm_mul_ps(r12,ewtabscale);
2863             ewitab           = _mm_cvttps_epi32(ewrt);
2864             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2865             ewitab           = _mm_slli_epi32(ewitab,2);
2866             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2867             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2868             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2869             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2870             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2871             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2872             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2873             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2874             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2875
2876             d                = _mm_sub_ps(r12,rswitch);
2877             d                = _mm_max_ps(d,_mm_setzero_ps());
2878             d2               = _mm_mul_ps(d,d);
2879             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2880
2881             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2882
2883             /* Evaluate switch function */
2884             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2885             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2886             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2887
2888             fscal            = felec;
2889
2890             fscal            = _mm_and_ps(fscal,cutoff_mask);
2891
2892             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2893
2894             /* Calculate temporary vectorial force */
2895             tx               = _mm_mul_ps(fscal,dx12);
2896             ty               = _mm_mul_ps(fscal,dy12);
2897             tz               = _mm_mul_ps(fscal,dz12);
2898
2899             /* Update vectorial force */
2900             fix1             = _mm_add_ps(fix1,tx);
2901             fiy1             = _mm_add_ps(fiy1,ty);
2902             fiz1             = _mm_add_ps(fiz1,tz);
2903
2904             fjx2             = _mm_add_ps(fjx2,tx);
2905             fjy2             = _mm_add_ps(fjy2,ty);
2906             fjz2             = _mm_add_ps(fjz2,tz);
2907
2908             }
2909
2910             /**************************
2911              * CALCULATE INTERACTIONS *
2912              **************************/
2913
2914             if (gmx_mm_any_lt(rsq13,rcutoff2))
2915             {
2916
2917             r13              = _mm_mul_ps(rsq13,rinv13);
2918             r13              = _mm_andnot_ps(dummy_mask,r13);
2919
2920             /* EWALD ELECTROSTATICS */
2921
2922             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2923             ewrt             = _mm_mul_ps(r13,ewtabscale);
2924             ewitab           = _mm_cvttps_epi32(ewrt);
2925             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2926             ewitab           = _mm_slli_epi32(ewitab,2);
2927             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2928             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2929             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2930             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2931             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2932             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2933             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2934             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2935             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2936
2937             d                = _mm_sub_ps(r13,rswitch);
2938             d                = _mm_max_ps(d,_mm_setzero_ps());
2939             d2               = _mm_mul_ps(d,d);
2940             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2941
2942             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2943
2944             /* Evaluate switch function */
2945             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2946             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2947             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2948
2949             fscal            = felec;
2950
2951             fscal            = _mm_and_ps(fscal,cutoff_mask);
2952
2953             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2954
2955             /* Calculate temporary vectorial force */
2956             tx               = _mm_mul_ps(fscal,dx13);
2957             ty               = _mm_mul_ps(fscal,dy13);
2958             tz               = _mm_mul_ps(fscal,dz13);
2959
2960             /* Update vectorial force */
2961             fix1             = _mm_add_ps(fix1,tx);
2962             fiy1             = _mm_add_ps(fiy1,ty);
2963             fiz1             = _mm_add_ps(fiz1,tz);
2964
2965             fjx3             = _mm_add_ps(fjx3,tx);
2966             fjy3             = _mm_add_ps(fjy3,ty);
2967             fjz3             = _mm_add_ps(fjz3,tz);
2968
2969             }
2970
2971             /**************************
2972              * CALCULATE INTERACTIONS *
2973              **************************/
2974
2975             if (gmx_mm_any_lt(rsq21,rcutoff2))
2976             {
2977
2978             r21              = _mm_mul_ps(rsq21,rinv21);
2979             r21              = _mm_andnot_ps(dummy_mask,r21);
2980
2981             /* EWALD ELECTROSTATICS */
2982
2983             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2984             ewrt             = _mm_mul_ps(r21,ewtabscale);
2985             ewitab           = _mm_cvttps_epi32(ewrt);
2986             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2987             ewitab           = _mm_slli_epi32(ewitab,2);
2988             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2989             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2990             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2991             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2992             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2993             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2994             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2995             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2996             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2997
2998             d                = _mm_sub_ps(r21,rswitch);
2999             d                = _mm_max_ps(d,_mm_setzero_ps());
3000             d2               = _mm_mul_ps(d,d);
3001             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3002
3003             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3004
3005             /* Evaluate switch function */
3006             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3007             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3008             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
3009
3010             fscal            = felec;
3011
3012             fscal            = _mm_and_ps(fscal,cutoff_mask);
3013
3014             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3015
3016             /* Calculate temporary vectorial force */
3017             tx               = _mm_mul_ps(fscal,dx21);
3018             ty               = _mm_mul_ps(fscal,dy21);
3019             tz               = _mm_mul_ps(fscal,dz21);
3020
3021             /* Update vectorial force */
3022             fix2             = _mm_add_ps(fix2,tx);
3023             fiy2             = _mm_add_ps(fiy2,ty);
3024             fiz2             = _mm_add_ps(fiz2,tz);
3025
3026             fjx1             = _mm_add_ps(fjx1,tx);
3027             fjy1             = _mm_add_ps(fjy1,ty);
3028             fjz1             = _mm_add_ps(fjz1,tz);
3029
3030             }
3031
3032             /**************************
3033              * CALCULATE INTERACTIONS *
3034              **************************/
3035
3036             if (gmx_mm_any_lt(rsq22,rcutoff2))
3037             {
3038
3039             r22              = _mm_mul_ps(rsq22,rinv22);
3040             r22              = _mm_andnot_ps(dummy_mask,r22);
3041
3042             /* EWALD ELECTROSTATICS */
3043
3044             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3045             ewrt             = _mm_mul_ps(r22,ewtabscale);
3046             ewitab           = _mm_cvttps_epi32(ewrt);
3047             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3048             ewitab           = _mm_slli_epi32(ewitab,2);
3049             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3050             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3051             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3052             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3053             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3054             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3055             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3056             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3057             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3058
3059             d                = _mm_sub_ps(r22,rswitch);
3060             d                = _mm_max_ps(d,_mm_setzero_ps());
3061             d2               = _mm_mul_ps(d,d);
3062             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3063
3064             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3065
3066             /* Evaluate switch function */
3067             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3068             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3069             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
3070
3071             fscal            = felec;
3072
3073             fscal            = _mm_and_ps(fscal,cutoff_mask);
3074
3075             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3076
3077             /* Calculate temporary vectorial force */
3078             tx               = _mm_mul_ps(fscal,dx22);
3079             ty               = _mm_mul_ps(fscal,dy22);
3080             tz               = _mm_mul_ps(fscal,dz22);
3081
3082             /* Update vectorial force */
3083             fix2             = _mm_add_ps(fix2,tx);
3084             fiy2             = _mm_add_ps(fiy2,ty);
3085             fiz2             = _mm_add_ps(fiz2,tz);
3086
3087             fjx2             = _mm_add_ps(fjx2,tx);
3088             fjy2             = _mm_add_ps(fjy2,ty);
3089             fjz2             = _mm_add_ps(fjz2,tz);
3090
3091             }
3092
3093             /**************************
3094              * CALCULATE INTERACTIONS *
3095              **************************/
3096
3097             if (gmx_mm_any_lt(rsq23,rcutoff2))
3098             {
3099
3100             r23              = _mm_mul_ps(rsq23,rinv23);
3101             r23              = _mm_andnot_ps(dummy_mask,r23);
3102
3103             /* EWALD ELECTROSTATICS */
3104
3105             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3106             ewrt             = _mm_mul_ps(r23,ewtabscale);
3107             ewitab           = _mm_cvttps_epi32(ewrt);
3108             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3109             ewitab           = _mm_slli_epi32(ewitab,2);
3110             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3111             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3112             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3113             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3114             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3115             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3116             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3117             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3118             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3119
3120             d                = _mm_sub_ps(r23,rswitch);
3121             d                = _mm_max_ps(d,_mm_setzero_ps());
3122             d2               = _mm_mul_ps(d,d);
3123             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3124
3125             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3126
3127             /* Evaluate switch function */
3128             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3129             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3130             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
3131
3132             fscal            = felec;
3133
3134             fscal            = _mm_and_ps(fscal,cutoff_mask);
3135
3136             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3137
3138             /* Calculate temporary vectorial force */
3139             tx               = _mm_mul_ps(fscal,dx23);
3140             ty               = _mm_mul_ps(fscal,dy23);
3141             tz               = _mm_mul_ps(fscal,dz23);
3142
3143             /* Update vectorial force */
3144             fix2             = _mm_add_ps(fix2,tx);
3145             fiy2             = _mm_add_ps(fiy2,ty);
3146             fiz2             = _mm_add_ps(fiz2,tz);
3147
3148             fjx3             = _mm_add_ps(fjx3,tx);
3149             fjy3             = _mm_add_ps(fjy3,ty);
3150             fjz3             = _mm_add_ps(fjz3,tz);
3151
3152             }
3153
3154             /**************************
3155              * CALCULATE INTERACTIONS *
3156              **************************/
3157
3158             if (gmx_mm_any_lt(rsq31,rcutoff2))
3159             {
3160
3161             r31              = _mm_mul_ps(rsq31,rinv31);
3162             r31              = _mm_andnot_ps(dummy_mask,r31);
3163
3164             /* EWALD ELECTROSTATICS */
3165
3166             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3167             ewrt             = _mm_mul_ps(r31,ewtabscale);
3168             ewitab           = _mm_cvttps_epi32(ewrt);
3169             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3170             ewitab           = _mm_slli_epi32(ewitab,2);
3171             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3172             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3173             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3174             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3175             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3176             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3177             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3178             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3179             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3180
3181             d                = _mm_sub_ps(r31,rswitch);
3182             d                = _mm_max_ps(d,_mm_setzero_ps());
3183             d2               = _mm_mul_ps(d,d);
3184             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3185
3186             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3187
3188             /* Evaluate switch function */
3189             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3190             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3191             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
3192
3193             fscal            = felec;
3194
3195             fscal            = _mm_and_ps(fscal,cutoff_mask);
3196
3197             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3198
3199             /* Calculate temporary vectorial force */
3200             tx               = _mm_mul_ps(fscal,dx31);
3201             ty               = _mm_mul_ps(fscal,dy31);
3202             tz               = _mm_mul_ps(fscal,dz31);
3203
3204             /* Update vectorial force */
3205             fix3             = _mm_add_ps(fix3,tx);
3206             fiy3             = _mm_add_ps(fiy3,ty);
3207             fiz3             = _mm_add_ps(fiz3,tz);
3208
3209             fjx1             = _mm_add_ps(fjx1,tx);
3210             fjy1             = _mm_add_ps(fjy1,ty);
3211             fjz1             = _mm_add_ps(fjz1,tz);
3212
3213             }
3214
3215             /**************************
3216              * CALCULATE INTERACTIONS *
3217              **************************/
3218
3219             if (gmx_mm_any_lt(rsq32,rcutoff2))
3220             {
3221
3222             r32              = _mm_mul_ps(rsq32,rinv32);
3223             r32              = _mm_andnot_ps(dummy_mask,r32);
3224
3225             /* EWALD ELECTROSTATICS */
3226
3227             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3228             ewrt             = _mm_mul_ps(r32,ewtabscale);
3229             ewitab           = _mm_cvttps_epi32(ewrt);
3230             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3231             ewitab           = _mm_slli_epi32(ewitab,2);
3232             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3233             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3234             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3235             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3236             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3237             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3238             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3239             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3240             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3241
3242             d                = _mm_sub_ps(r32,rswitch);
3243             d                = _mm_max_ps(d,_mm_setzero_ps());
3244             d2               = _mm_mul_ps(d,d);
3245             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3246
3247             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3248
3249             /* Evaluate switch function */
3250             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3251             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3252             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
3253
3254             fscal            = felec;
3255
3256             fscal            = _mm_and_ps(fscal,cutoff_mask);
3257
3258             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3259
3260             /* Calculate temporary vectorial force */
3261             tx               = _mm_mul_ps(fscal,dx32);
3262             ty               = _mm_mul_ps(fscal,dy32);
3263             tz               = _mm_mul_ps(fscal,dz32);
3264
3265             /* Update vectorial force */
3266             fix3             = _mm_add_ps(fix3,tx);
3267             fiy3             = _mm_add_ps(fiy3,ty);
3268             fiz3             = _mm_add_ps(fiz3,tz);
3269
3270             fjx2             = _mm_add_ps(fjx2,tx);
3271             fjy2             = _mm_add_ps(fjy2,ty);
3272             fjz2             = _mm_add_ps(fjz2,tz);
3273
3274             }
3275
3276             /**************************
3277              * CALCULATE INTERACTIONS *
3278              **************************/
3279
3280             if (gmx_mm_any_lt(rsq33,rcutoff2))
3281             {
3282
3283             r33              = _mm_mul_ps(rsq33,rinv33);
3284             r33              = _mm_andnot_ps(dummy_mask,r33);
3285
3286             /* EWALD ELECTROSTATICS */
3287
3288             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3289             ewrt             = _mm_mul_ps(r33,ewtabscale);
3290             ewitab           = _mm_cvttps_epi32(ewrt);
3291             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3292             ewitab           = _mm_slli_epi32(ewitab,2);
3293             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3294             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3295             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3296             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3297             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3298             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3299             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3300             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3301             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3302
3303             d                = _mm_sub_ps(r33,rswitch);
3304             d                = _mm_max_ps(d,_mm_setzero_ps());
3305             d2               = _mm_mul_ps(d,d);
3306             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3307
3308             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3309
3310             /* Evaluate switch function */
3311             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3312             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3313             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
3314
3315             fscal            = felec;
3316
3317             fscal            = _mm_and_ps(fscal,cutoff_mask);
3318
3319             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3320
3321             /* Calculate temporary vectorial force */
3322             tx               = _mm_mul_ps(fscal,dx33);
3323             ty               = _mm_mul_ps(fscal,dy33);
3324             tz               = _mm_mul_ps(fscal,dz33);
3325
3326             /* Update vectorial force */
3327             fix3             = _mm_add_ps(fix3,tx);
3328             fiy3             = _mm_add_ps(fiy3,ty);
3329             fiz3             = _mm_add_ps(fiz3,tz);
3330
3331             fjx3             = _mm_add_ps(fjx3,tx);
3332             fjy3             = _mm_add_ps(fjy3,ty);
3333             fjz3             = _mm_add_ps(fjz3,tz);
3334
3335             }
3336
3337             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(f+j_coord_offsetA,f+j_coord_offsetB,
3338                                                    f+j_coord_offsetC,f+j_coord_offsetD,
3339                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3340                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3341
3342             /* Inner loop uses 627 flops */
3343         }
3344
3345         /* End of innermost loop */
3346
3347         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3348                                               f+i_coord_offset,fshift+i_shift_offset);
3349
3350         /* Increment number of inner iterations */
3351         inneriter                  += j_index_end - j_index_start;
3352
3353         /* Outer loop uses 36 flops */
3354     }
3355
3356     /* Increment number of outer iterations */
3357     outeriter        += nri;
3358
3359     /* Update outer/inner flops */
3360
3361     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*36 + inneriter*627);
3362 }