98f1e5ff892503eb44b3e240a605fa6d4d774858
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water4-Water4
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwioffset3;
76     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
77     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
78     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
79     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
80     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
81     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
82     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
83     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
84     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
85     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
86     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
89     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
92     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
93     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
94     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
95     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
96     real             *charge;
97     int              nvdwtype;
98     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99     int              *vdwtype;
100     real             *vdwparam;
101     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
102     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
103     __m128i          ewitab;
104     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
105     real             *ewtab;
106     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
107     real             rswitch_scalar,d_scalar;
108     __m128           dummy_mask,cutoff_mask;
109     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
110     __m128           one     = _mm_set1_ps(1.0);
111     __m128           two     = _mm_set1_ps(2.0);
112     x                = xx[0];
113     f                = ff[0];
114
115     nri              = nlist->nri;
116     iinr             = nlist->iinr;
117     jindex           = nlist->jindex;
118     jjnr             = nlist->jjnr;
119     shiftidx         = nlist->shift;
120     gid              = nlist->gid;
121     shiftvec         = fr->shift_vec[0];
122     fshift           = fr->fshift[0];
123     facel            = _mm_set1_ps(fr->epsfac);
124     charge           = mdatoms->chargeA;
125     nvdwtype         = fr->ntype;
126     vdwparam         = fr->nbfp;
127     vdwtype          = mdatoms->typeA;
128
129     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
130     ewtab            = fr->ic->tabq_coul_FDV0;
131     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
132     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
133
134     /* Setup water-specific parameters */
135     inr              = nlist->iinr[0];
136     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
139     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
140
141     jq1              = _mm_set1_ps(charge[inr+1]);
142     jq2              = _mm_set1_ps(charge[inr+2]);
143     jq3              = _mm_set1_ps(charge[inr+3]);
144     vdwjidx0A        = 2*vdwtype[inr+0];
145     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
146     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
147     qq11             = _mm_mul_ps(iq1,jq1);
148     qq12             = _mm_mul_ps(iq1,jq2);
149     qq13             = _mm_mul_ps(iq1,jq3);
150     qq21             = _mm_mul_ps(iq2,jq1);
151     qq22             = _mm_mul_ps(iq2,jq2);
152     qq23             = _mm_mul_ps(iq2,jq3);
153     qq31             = _mm_mul_ps(iq3,jq1);
154     qq32             = _mm_mul_ps(iq3,jq2);
155     qq33             = _mm_mul_ps(iq3,jq3);
156
157     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
158     rcutoff_scalar   = fr->rcoulomb;
159     rcutoff          = _mm_set1_ps(rcutoff_scalar);
160     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
161
162     rswitch_scalar   = fr->rcoulomb_switch;
163     rswitch          = _mm_set1_ps(rswitch_scalar);
164     /* Setup switch parameters */
165     d_scalar         = rcutoff_scalar-rswitch_scalar;
166     d                = _mm_set1_ps(d_scalar);
167     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
168     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
169     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
170     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
171     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
172     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
173
174     /* Avoid stupid compiler warnings */
175     jnrA = jnrB = jnrC = jnrD = 0;
176     j_coord_offsetA = 0;
177     j_coord_offsetB = 0;
178     j_coord_offsetC = 0;
179     j_coord_offsetD = 0;
180
181     outeriter        = 0;
182     inneriter        = 0;
183
184     for(iidx=0;iidx<4*DIM;iidx++)
185     {
186         scratch[iidx] = 0.0;
187     }  
188
189     /* Start outer loop over neighborlists */
190     for(iidx=0; iidx<nri; iidx++)
191     {
192         /* Load shift vector for this list */
193         i_shift_offset   = DIM*shiftidx[iidx];
194
195         /* Load limits for loop over neighbors */
196         j_index_start    = jindex[iidx];
197         j_index_end      = jindex[iidx+1];
198
199         /* Get outer coordinate index */
200         inr              = iinr[iidx];
201         i_coord_offset   = DIM*inr;
202
203         /* Load i particle coords and add shift vector */
204         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
205                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
206         
207         fix0             = _mm_setzero_ps();
208         fiy0             = _mm_setzero_ps();
209         fiz0             = _mm_setzero_ps();
210         fix1             = _mm_setzero_ps();
211         fiy1             = _mm_setzero_ps();
212         fiz1             = _mm_setzero_ps();
213         fix2             = _mm_setzero_ps();
214         fiy2             = _mm_setzero_ps();
215         fiz2             = _mm_setzero_ps();
216         fix3             = _mm_setzero_ps();
217         fiy3             = _mm_setzero_ps();
218         fiz3             = _mm_setzero_ps();
219
220         /* Reset potential sums */
221         velecsum         = _mm_setzero_ps();
222         vvdwsum          = _mm_setzero_ps();
223
224         /* Start inner kernel loop */
225         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
226         {
227
228             /* Get j neighbor index, and coordinate index */
229             jnrA             = jjnr[jidx];
230             jnrB             = jjnr[jidx+1];
231             jnrC             = jjnr[jidx+2];
232             jnrD             = jjnr[jidx+3];
233             j_coord_offsetA  = DIM*jnrA;
234             j_coord_offsetB  = DIM*jnrB;
235             j_coord_offsetC  = DIM*jnrC;
236             j_coord_offsetD  = DIM*jnrD;
237
238             /* load j atom coordinates */
239             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
240                                               x+j_coord_offsetC,x+j_coord_offsetD,
241                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
242                                               &jy2,&jz2,&jx3,&jy3,&jz3);
243
244             /* Calculate displacement vector */
245             dx00             = _mm_sub_ps(ix0,jx0);
246             dy00             = _mm_sub_ps(iy0,jy0);
247             dz00             = _mm_sub_ps(iz0,jz0);
248             dx11             = _mm_sub_ps(ix1,jx1);
249             dy11             = _mm_sub_ps(iy1,jy1);
250             dz11             = _mm_sub_ps(iz1,jz1);
251             dx12             = _mm_sub_ps(ix1,jx2);
252             dy12             = _mm_sub_ps(iy1,jy2);
253             dz12             = _mm_sub_ps(iz1,jz2);
254             dx13             = _mm_sub_ps(ix1,jx3);
255             dy13             = _mm_sub_ps(iy1,jy3);
256             dz13             = _mm_sub_ps(iz1,jz3);
257             dx21             = _mm_sub_ps(ix2,jx1);
258             dy21             = _mm_sub_ps(iy2,jy1);
259             dz21             = _mm_sub_ps(iz2,jz1);
260             dx22             = _mm_sub_ps(ix2,jx2);
261             dy22             = _mm_sub_ps(iy2,jy2);
262             dz22             = _mm_sub_ps(iz2,jz2);
263             dx23             = _mm_sub_ps(ix2,jx3);
264             dy23             = _mm_sub_ps(iy2,jy3);
265             dz23             = _mm_sub_ps(iz2,jz3);
266             dx31             = _mm_sub_ps(ix3,jx1);
267             dy31             = _mm_sub_ps(iy3,jy1);
268             dz31             = _mm_sub_ps(iz3,jz1);
269             dx32             = _mm_sub_ps(ix3,jx2);
270             dy32             = _mm_sub_ps(iy3,jy2);
271             dz32             = _mm_sub_ps(iz3,jz2);
272             dx33             = _mm_sub_ps(ix3,jx3);
273             dy33             = _mm_sub_ps(iy3,jy3);
274             dz33             = _mm_sub_ps(iz3,jz3);
275
276             /* Calculate squared distance and things based on it */
277             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
278             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
279             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
280             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
281             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
282             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
283             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
284             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
285             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
286             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
287
288             rinv00           = gmx_mm_invsqrt_ps(rsq00);
289             rinv11           = gmx_mm_invsqrt_ps(rsq11);
290             rinv12           = gmx_mm_invsqrt_ps(rsq12);
291             rinv13           = gmx_mm_invsqrt_ps(rsq13);
292             rinv21           = gmx_mm_invsqrt_ps(rsq21);
293             rinv22           = gmx_mm_invsqrt_ps(rsq22);
294             rinv23           = gmx_mm_invsqrt_ps(rsq23);
295             rinv31           = gmx_mm_invsqrt_ps(rsq31);
296             rinv32           = gmx_mm_invsqrt_ps(rsq32);
297             rinv33           = gmx_mm_invsqrt_ps(rsq33);
298
299             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
300             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
301             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
302             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
303             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
304             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
305             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
306             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
307             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
308             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
309
310             fjx0             = _mm_setzero_ps();
311             fjy0             = _mm_setzero_ps();
312             fjz0             = _mm_setzero_ps();
313             fjx1             = _mm_setzero_ps();
314             fjy1             = _mm_setzero_ps();
315             fjz1             = _mm_setzero_ps();
316             fjx2             = _mm_setzero_ps();
317             fjy2             = _mm_setzero_ps();
318             fjz2             = _mm_setzero_ps();
319             fjx3             = _mm_setzero_ps();
320             fjy3             = _mm_setzero_ps();
321             fjz3             = _mm_setzero_ps();
322
323             /**************************
324              * CALCULATE INTERACTIONS *
325              **************************/
326
327             if (gmx_mm_any_lt(rsq00,rcutoff2))
328             {
329
330             r00              = _mm_mul_ps(rsq00,rinv00);
331
332             /* LENNARD-JONES DISPERSION/REPULSION */
333
334             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
335             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
336             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
337             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
338             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
339
340             d                = _mm_sub_ps(r00,rswitch);
341             d                = _mm_max_ps(d,_mm_setzero_ps());
342             d2               = _mm_mul_ps(d,d);
343             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
344
345             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
346
347             /* Evaluate switch function */
348             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
349             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
350             vvdw             = _mm_mul_ps(vvdw,sw);
351             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
352
353             /* Update potential sum for this i atom from the interaction with this j atom. */
354             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
355             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
356
357             fscal            = fvdw;
358
359             fscal            = _mm_and_ps(fscal,cutoff_mask);
360
361             /* Calculate temporary vectorial force */
362             tx               = _mm_mul_ps(fscal,dx00);
363             ty               = _mm_mul_ps(fscal,dy00);
364             tz               = _mm_mul_ps(fscal,dz00);
365
366             /* Update vectorial force */
367             fix0             = _mm_add_ps(fix0,tx);
368             fiy0             = _mm_add_ps(fiy0,ty);
369             fiz0             = _mm_add_ps(fiz0,tz);
370
371             fjx0             = _mm_add_ps(fjx0,tx);
372             fjy0             = _mm_add_ps(fjy0,ty);
373             fjz0             = _mm_add_ps(fjz0,tz);
374             
375             }
376
377             /**************************
378              * CALCULATE INTERACTIONS *
379              **************************/
380
381             if (gmx_mm_any_lt(rsq11,rcutoff2))
382             {
383
384             r11              = _mm_mul_ps(rsq11,rinv11);
385
386             /* EWALD ELECTROSTATICS */
387
388             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
389             ewrt             = _mm_mul_ps(r11,ewtabscale);
390             ewitab           = _mm_cvttps_epi32(ewrt);
391             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
392             ewitab           = _mm_slli_epi32(ewitab,2);
393             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
394             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
395             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
396             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
397             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
398             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
399             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
400             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
401             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
402
403             d                = _mm_sub_ps(r11,rswitch);
404             d                = _mm_max_ps(d,_mm_setzero_ps());
405             d2               = _mm_mul_ps(d,d);
406             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
407
408             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
409
410             /* Evaluate switch function */
411             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
412             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
413             velec            = _mm_mul_ps(velec,sw);
414             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
415
416             /* Update potential sum for this i atom from the interaction with this j atom. */
417             velec            = _mm_and_ps(velec,cutoff_mask);
418             velecsum         = _mm_add_ps(velecsum,velec);
419
420             fscal            = felec;
421
422             fscal            = _mm_and_ps(fscal,cutoff_mask);
423
424             /* Calculate temporary vectorial force */
425             tx               = _mm_mul_ps(fscal,dx11);
426             ty               = _mm_mul_ps(fscal,dy11);
427             tz               = _mm_mul_ps(fscal,dz11);
428
429             /* Update vectorial force */
430             fix1             = _mm_add_ps(fix1,tx);
431             fiy1             = _mm_add_ps(fiy1,ty);
432             fiz1             = _mm_add_ps(fiz1,tz);
433
434             fjx1             = _mm_add_ps(fjx1,tx);
435             fjy1             = _mm_add_ps(fjy1,ty);
436             fjz1             = _mm_add_ps(fjz1,tz);
437             
438             }
439
440             /**************************
441              * CALCULATE INTERACTIONS *
442              **************************/
443
444             if (gmx_mm_any_lt(rsq12,rcutoff2))
445             {
446
447             r12              = _mm_mul_ps(rsq12,rinv12);
448
449             /* EWALD ELECTROSTATICS */
450
451             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
452             ewrt             = _mm_mul_ps(r12,ewtabscale);
453             ewitab           = _mm_cvttps_epi32(ewrt);
454             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
455             ewitab           = _mm_slli_epi32(ewitab,2);
456             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
457             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
458             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
459             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
460             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
461             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
462             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
463             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
464             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
465
466             d                = _mm_sub_ps(r12,rswitch);
467             d                = _mm_max_ps(d,_mm_setzero_ps());
468             d2               = _mm_mul_ps(d,d);
469             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
470
471             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
472
473             /* Evaluate switch function */
474             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
475             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
476             velec            = _mm_mul_ps(velec,sw);
477             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
478
479             /* Update potential sum for this i atom from the interaction with this j atom. */
480             velec            = _mm_and_ps(velec,cutoff_mask);
481             velecsum         = _mm_add_ps(velecsum,velec);
482
483             fscal            = felec;
484
485             fscal            = _mm_and_ps(fscal,cutoff_mask);
486
487             /* Calculate temporary vectorial force */
488             tx               = _mm_mul_ps(fscal,dx12);
489             ty               = _mm_mul_ps(fscal,dy12);
490             tz               = _mm_mul_ps(fscal,dz12);
491
492             /* Update vectorial force */
493             fix1             = _mm_add_ps(fix1,tx);
494             fiy1             = _mm_add_ps(fiy1,ty);
495             fiz1             = _mm_add_ps(fiz1,tz);
496
497             fjx2             = _mm_add_ps(fjx2,tx);
498             fjy2             = _mm_add_ps(fjy2,ty);
499             fjz2             = _mm_add_ps(fjz2,tz);
500             
501             }
502
503             /**************************
504              * CALCULATE INTERACTIONS *
505              **************************/
506
507             if (gmx_mm_any_lt(rsq13,rcutoff2))
508             {
509
510             r13              = _mm_mul_ps(rsq13,rinv13);
511
512             /* EWALD ELECTROSTATICS */
513
514             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
515             ewrt             = _mm_mul_ps(r13,ewtabscale);
516             ewitab           = _mm_cvttps_epi32(ewrt);
517             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
518             ewitab           = _mm_slli_epi32(ewitab,2);
519             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
520             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
521             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
522             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
523             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
524             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
525             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
526             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
527             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
528
529             d                = _mm_sub_ps(r13,rswitch);
530             d                = _mm_max_ps(d,_mm_setzero_ps());
531             d2               = _mm_mul_ps(d,d);
532             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
533
534             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
535
536             /* Evaluate switch function */
537             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
538             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
539             velec            = _mm_mul_ps(velec,sw);
540             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
541
542             /* Update potential sum for this i atom from the interaction with this j atom. */
543             velec            = _mm_and_ps(velec,cutoff_mask);
544             velecsum         = _mm_add_ps(velecsum,velec);
545
546             fscal            = felec;
547
548             fscal            = _mm_and_ps(fscal,cutoff_mask);
549
550             /* Calculate temporary vectorial force */
551             tx               = _mm_mul_ps(fscal,dx13);
552             ty               = _mm_mul_ps(fscal,dy13);
553             tz               = _mm_mul_ps(fscal,dz13);
554
555             /* Update vectorial force */
556             fix1             = _mm_add_ps(fix1,tx);
557             fiy1             = _mm_add_ps(fiy1,ty);
558             fiz1             = _mm_add_ps(fiz1,tz);
559
560             fjx3             = _mm_add_ps(fjx3,tx);
561             fjy3             = _mm_add_ps(fjy3,ty);
562             fjz3             = _mm_add_ps(fjz3,tz);
563             
564             }
565
566             /**************************
567              * CALCULATE INTERACTIONS *
568              **************************/
569
570             if (gmx_mm_any_lt(rsq21,rcutoff2))
571             {
572
573             r21              = _mm_mul_ps(rsq21,rinv21);
574
575             /* EWALD ELECTROSTATICS */
576
577             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
578             ewrt             = _mm_mul_ps(r21,ewtabscale);
579             ewitab           = _mm_cvttps_epi32(ewrt);
580             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
581             ewitab           = _mm_slli_epi32(ewitab,2);
582             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
583             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
584             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
585             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
586             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
587             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
588             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
589             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
590             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
591
592             d                = _mm_sub_ps(r21,rswitch);
593             d                = _mm_max_ps(d,_mm_setzero_ps());
594             d2               = _mm_mul_ps(d,d);
595             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
596
597             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
598
599             /* Evaluate switch function */
600             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
601             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
602             velec            = _mm_mul_ps(velec,sw);
603             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
604
605             /* Update potential sum for this i atom from the interaction with this j atom. */
606             velec            = _mm_and_ps(velec,cutoff_mask);
607             velecsum         = _mm_add_ps(velecsum,velec);
608
609             fscal            = felec;
610
611             fscal            = _mm_and_ps(fscal,cutoff_mask);
612
613             /* Calculate temporary vectorial force */
614             tx               = _mm_mul_ps(fscal,dx21);
615             ty               = _mm_mul_ps(fscal,dy21);
616             tz               = _mm_mul_ps(fscal,dz21);
617
618             /* Update vectorial force */
619             fix2             = _mm_add_ps(fix2,tx);
620             fiy2             = _mm_add_ps(fiy2,ty);
621             fiz2             = _mm_add_ps(fiz2,tz);
622
623             fjx1             = _mm_add_ps(fjx1,tx);
624             fjy1             = _mm_add_ps(fjy1,ty);
625             fjz1             = _mm_add_ps(fjz1,tz);
626             
627             }
628
629             /**************************
630              * CALCULATE INTERACTIONS *
631              **************************/
632
633             if (gmx_mm_any_lt(rsq22,rcutoff2))
634             {
635
636             r22              = _mm_mul_ps(rsq22,rinv22);
637
638             /* EWALD ELECTROSTATICS */
639
640             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
641             ewrt             = _mm_mul_ps(r22,ewtabscale);
642             ewitab           = _mm_cvttps_epi32(ewrt);
643             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
644             ewitab           = _mm_slli_epi32(ewitab,2);
645             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
646             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
647             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
648             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
649             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
650             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
651             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
652             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
653             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
654
655             d                = _mm_sub_ps(r22,rswitch);
656             d                = _mm_max_ps(d,_mm_setzero_ps());
657             d2               = _mm_mul_ps(d,d);
658             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
659
660             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
661
662             /* Evaluate switch function */
663             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
664             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
665             velec            = _mm_mul_ps(velec,sw);
666             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
667
668             /* Update potential sum for this i atom from the interaction with this j atom. */
669             velec            = _mm_and_ps(velec,cutoff_mask);
670             velecsum         = _mm_add_ps(velecsum,velec);
671
672             fscal            = felec;
673
674             fscal            = _mm_and_ps(fscal,cutoff_mask);
675
676             /* Calculate temporary vectorial force */
677             tx               = _mm_mul_ps(fscal,dx22);
678             ty               = _mm_mul_ps(fscal,dy22);
679             tz               = _mm_mul_ps(fscal,dz22);
680
681             /* Update vectorial force */
682             fix2             = _mm_add_ps(fix2,tx);
683             fiy2             = _mm_add_ps(fiy2,ty);
684             fiz2             = _mm_add_ps(fiz2,tz);
685
686             fjx2             = _mm_add_ps(fjx2,tx);
687             fjy2             = _mm_add_ps(fjy2,ty);
688             fjz2             = _mm_add_ps(fjz2,tz);
689             
690             }
691
692             /**************************
693              * CALCULATE INTERACTIONS *
694              **************************/
695
696             if (gmx_mm_any_lt(rsq23,rcutoff2))
697             {
698
699             r23              = _mm_mul_ps(rsq23,rinv23);
700
701             /* EWALD ELECTROSTATICS */
702
703             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
704             ewrt             = _mm_mul_ps(r23,ewtabscale);
705             ewitab           = _mm_cvttps_epi32(ewrt);
706             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
707             ewitab           = _mm_slli_epi32(ewitab,2);
708             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
709             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
710             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
711             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
712             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
713             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
714             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
715             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
716             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
717
718             d                = _mm_sub_ps(r23,rswitch);
719             d                = _mm_max_ps(d,_mm_setzero_ps());
720             d2               = _mm_mul_ps(d,d);
721             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
722
723             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
724
725             /* Evaluate switch function */
726             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
727             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
728             velec            = _mm_mul_ps(velec,sw);
729             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
730
731             /* Update potential sum for this i atom from the interaction with this j atom. */
732             velec            = _mm_and_ps(velec,cutoff_mask);
733             velecsum         = _mm_add_ps(velecsum,velec);
734
735             fscal            = felec;
736
737             fscal            = _mm_and_ps(fscal,cutoff_mask);
738
739             /* Calculate temporary vectorial force */
740             tx               = _mm_mul_ps(fscal,dx23);
741             ty               = _mm_mul_ps(fscal,dy23);
742             tz               = _mm_mul_ps(fscal,dz23);
743
744             /* Update vectorial force */
745             fix2             = _mm_add_ps(fix2,tx);
746             fiy2             = _mm_add_ps(fiy2,ty);
747             fiz2             = _mm_add_ps(fiz2,tz);
748
749             fjx3             = _mm_add_ps(fjx3,tx);
750             fjy3             = _mm_add_ps(fjy3,ty);
751             fjz3             = _mm_add_ps(fjz3,tz);
752             
753             }
754
755             /**************************
756              * CALCULATE INTERACTIONS *
757              **************************/
758
759             if (gmx_mm_any_lt(rsq31,rcutoff2))
760             {
761
762             r31              = _mm_mul_ps(rsq31,rinv31);
763
764             /* EWALD ELECTROSTATICS */
765
766             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
767             ewrt             = _mm_mul_ps(r31,ewtabscale);
768             ewitab           = _mm_cvttps_epi32(ewrt);
769             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
770             ewitab           = _mm_slli_epi32(ewitab,2);
771             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
772             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
773             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
774             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
775             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
776             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
777             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
778             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
779             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
780
781             d                = _mm_sub_ps(r31,rswitch);
782             d                = _mm_max_ps(d,_mm_setzero_ps());
783             d2               = _mm_mul_ps(d,d);
784             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
785
786             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
787
788             /* Evaluate switch function */
789             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
790             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
791             velec            = _mm_mul_ps(velec,sw);
792             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
793
794             /* Update potential sum for this i atom from the interaction with this j atom. */
795             velec            = _mm_and_ps(velec,cutoff_mask);
796             velecsum         = _mm_add_ps(velecsum,velec);
797
798             fscal            = felec;
799
800             fscal            = _mm_and_ps(fscal,cutoff_mask);
801
802             /* Calculate temporary vectorial force */
803             tx               = _mm_mul_ps(fscal,dx31);
804             ty               = _mm_mul_ps(fscal,dy31);
805             tz               = _mm_mul_ps(fscal,dz31);
806
807             /* Update vectorial force */
808             fix3             = _mm_add_ps(fix3,tx);
809             fiy3             = _mm_add_ps(fiy3,ty);
810             fiz3             = _mm_add_ps(fiz3,tz);
811
812             fjx1             = _mm_add_ps(fjx1,tx);
813             fjy1             = _mm_add_ps(fjy1,ty);
814             fjz1             = _mm_add_ps(fjz1,tz);
815             
816             }
817
818             /**************************
819              * CALCULATE INTERACTIONS *
820              **************************/
821
822             if (gmx_mm_any_lt(rsq32,rcutoff2))
823             {
824
825             r32              = _mm_mul_ps(rsq32,rinv32);
826
827             /* EWALD ELECTROSTATICS */
828
829             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
830             ewrt             = _mm_mul_ps(r32,ewtabscale);
831             ewitab           = _mm_cvttps_epi32(ewrt);
832             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
833             ewitab           = _mm_slli_epi32(ewitab,2);
834             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
835             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
836             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
837             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
838             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
839             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
840             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
841             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
842             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
843
844             d                = _mm_sub_ps(r32,rswitch);
845             d                = _mm_max_ps(d,_mm_setzero_ps());
846             d2               = _mm_mul_ps(d,d);
847             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
848
849             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
850
851             /* Evaluate switch function */
852             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
853             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
854             velec            = _mm_mul_ps(velec,sw);
855             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
856
857             /* Update potential sum for this i atom from the interaction with this j atom. */
858             velec            = _mm_and_ps(velec,cutoff_mask);
859             velecsum         = _mm_add_ps(velecsum,velec);
860
861             fscal            = felec;
862
863             fscal            = _mm_and_ps(fscal,cutoff_mask);
864
865             /* Calculate temporary vectorial force */
866             tx               = _mm_mul_ps(fscal,dx32);
867             ty               = _mm_mul_ps(fscal,dy32);
868             tz               = _mm_mul_ps(fscal,dz32);
869
870             /* Update vectorial force */
871             fix3             = _mm_add_ps(fix3,tx);
872             fiy3             = _mm_add_ps(fiy3,ty);
873             fiz3             = _mm_add_ps(fiz3,tz);
874
875             fjx2             = _mm_add_ps(fjx2,tx);
876             fjy2             = _mm_add_ps(fjy2,ty);
877             fjz2             = _mm_add_ps(fjz2,tz);
878             
879             }
880
881             /**************************
882              * CALCULATE INTERACTIONS *
883              **************************/
884
885             if (gmx_mm_any_lt(rsq33,rcutoff2))
886             {
887
888             r33              = _mm_mul_ps(rsq33,rinv33);
889
890             /* EWALD ELECTROSTATICS */
891
892             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
893             ewrt             = _mm_mul_ps(r33,ewtabscale);
894             ewitab           = _mm_cvttps_epi32(ewrt);
895             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
896             ewitab           = _mm_slli_epi32(ewitab,2);
897             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
898             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
899             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
900             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
901             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
902             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
903             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
904             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
905             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
906
907             d                = _mm_sub_ps(r33,rswitch);
908             d                = _mm_max_ps(d,_mm_setzero_ps());
909             d2               = _mm_mul_ps(d,d);
910             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
911
912             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
913
914             /* Evaluate switch function */
915             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
916             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
917             velec            = _mm_mul_ps(velec,sw);
918             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm_and_ps(velec,cutoff_mask);
922             velecsum         = _mm_add_ps(velecsum,velec);
923
924             fscal            = felec;
925
926             fscal            = _mm_and_ps(fscal,cutoff_mask);
927
928             /* Calculate temporary vectorial force */
929             tx               = _mm_mul_ps(fscal,dx33);
930             ty               = _mm_mul_ps(fscal,dy33);
931             tz               = _mm_mul_ps(fscal,dz33);
932
933             /* Update vectorial force */
934             fix3             = _mm_add_ps(fix3,tx);
935             fiy3             = _mm_add_ps(fiy3,ty);
936             fiz3             = _mm_add_ps(fiz3,tz);
937
938             fjx3             = _mm_add_ps(fjx3,tx);
939             fjy3             = _mm_add_ps(fjy3,ty);
940             fjz3             = _mm_add_ps(fjz3,tz);
941             
942             }
943
944             fjptrA             = f+j_coord_offsetA;
945             fjptrB             = f+j_coord_offsetB;
946             fjptrC             = f+j_coord_offsetC;
947             fjptrD             = f+j_coord_offsetD;
948
949             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
950                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
951                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
952
953             /* Inner loop uses 647 flops */
954         }
955
956         if(jidx<j_index_end)
957         {
958
959             /* Get j neighbor index, and coordinate index */
960             jnrlistA         = jjnr[jidx];
961             jnrlistB         = jjnr[jidx+1];
962             jnrlistC         = jjnr[jidx+2];
963             jnrlistD         = jjnr[jidx+3];
964             /* Sign of each element will be negative for non-real atoms.
965              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
966              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
967              */
968             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
969             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
970             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
971             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
972             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
973             j_coord_offsetA  = DIM*jnrA;
974             j_coord_offsetB  = DIM*jnrB;
975             j_coord_offsetC  = DIM*jnrC;
976             j_coord_offsetD  = DIM*jnrD;
977
978             /* load j atom coordinates */
979             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
980                                               x+j_coord_offsetC,x+j_coord_offsetD,
981                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
982                                               &jy2,&jz2,&jx3,&jy3,&jz3);
983
984             /* Calculate displacement vector */
985             dx00             = _mm_sub_ps(ix0,jx0);
986             dy00             = _mm_sub_ps(iy0,jy0);
987             dz00             = _mm_sub_ps(iz0,jz0);
988             dx11             = _mm_sub_ps(ix1,jx1);
989             dy11             = _mm_sub_ps(iy1,jy1);
990             dz11             = _mm_sub_ps(iz1,jz1);
991             dx12             = _mm_sub_ps(ix1,jx2);
992             dy12             = _mm_sub_ps(iy1,jy2);
993             dz12             = _mm_sub_ps(iz1,jz2);
994             dx13             = _mm_sub_ps(ix1,jx3);
995             dy13             = _mm_sub_ps(iy1,jy3);
996             dz13             = _mm_sub_ps(iz1,jz3);
997             dx21             = _mm_sub_ps(ix2,jx1);
998             dy21             = _mm_sub_ps(iy2,jy1);
999             dz21             = _mm_sub_ps(iz2,jz1);
1000             dx22             = _mm_sub_ps(ix2,jx2);
1001             dy22             = _mm_sub_ps(iy2,jy2);
1002             dz22             = _mm_sub_ps(iz2,jz2);
1003             dx23             = _mm_sub_ps(ix2,jx3);
1004             dy23             = _mm_sub_ps(iy2,jy3);
1005             dz23             = _mm_sub_ps(iz2,jz3);
1006             dx31             = _mm_sub_ps(ix3,jx1);
1007             dy31             = _mm_sub_ps(iy3,jy1);
1008             dz31             = _mm_sub_ps(iz3,jz1);
1009             dx32             = _mm_sub_ps(ix3,jx2);
1010             dy32             = _mm_sub_ps(iy3,jy2);
1011             dz32             = _mm_sub_ps(iz3,jz2);
1012             dx33             = _mm_sub_ps(ix3,jx3);
1013             dy33             = _mm_sub_ps(iy3,jy3);
1014             dz33             = _mm_sub_ps(iz3,jz3);
1015
1016             /* Calculate squared distance and things based on it */
1017             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1018             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1019             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1020             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
1021             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1022             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1023             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
1024             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
1025             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
1026             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
1027
1028             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1029             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1030             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1031             rinv13           = gmx_mm_invsqrt_ps(rsq13);
1032             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1033             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1034             rinv23           = gmx_mm_invsqrt_ps(rsq23);
1035             rinv31           = gmx_mm_invsqrt_ps(rsq31);
1036             rinv32           = gmx_mm_invsqrt_ps(rsq32);
1037             rinv33           = gmx_mm_invsqrt_ps(rsq33);
1038
1039             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1040             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1041             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1042             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
1043             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1044             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1045             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
1046             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
1047             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
1048             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
1049
1050             fjx0             = _mm_setzero_ps();
1051             fjy0             = _mm_setzero_ps();
1052             fjz0             = _mm_setzero_ps();
1053             fjx1             = _mm_setzero_ps();
1054             fjy1             = _mm_setzero_ps();
1055             fjz1             = _mm_setzero_ps();
1056             fjx2             = _mm_setzero_ps();
1057             fjy2             = _mm_setzero_ps();
1058             fjz2             = _mm_setzero_ps();
1059             fjx3             = _mm_setzero_ps();
1060             fjy3             = _mm_setzero_ps();
1061             fjz3             = _mm_setzero_ps();
1062
1063             /**************************
1064              * CALCULATE INTERACTIONS *
1065              **************************/
1066
1067             if (gmx_mm_any_lt(rsq00,rcutoff2))
1068             {
1069
1070             r00              = _mm_mul_ps(rsq00,rinv00);
1071             r00              = _mm_andnot_ps(dummy_mask,r00);
1072
1073             /* LENNARD-JONES DISPERSION/REPULSION */
1074
1075             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1076             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1077             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1078             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1079             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1080
1081             d                = _mm_sub_ps(r00,rswitch);
1082             d                = _mm_max_ps(d,_mm_setzero_ps());
1083             d2               = _mm_mul_ps(d,d);
1084             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1085
1086             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1087
1088             /* Evaluate switch function */
1089             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1090             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1091             vvdw             = _mm_mul_ps(vvdw,sw);
1092             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1093
1094             /* Update potential sum for this i atom from the interaction with this j atom. */
1095             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
1096             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
1097             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
1098
1099             fscal            = fvdw;
1100
1101             fscal            = _mm_and_ps(fscal,cutoff_mask);
1102
1103             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1104
1105             /* Calculate temporary vectorial force */
1106             tx               = _mm_mul_ps(fscal,dx00);
1107             ty               = _mm_mul_ps(fscal,dy00);
1108             tz               = _mm_mul_ps(fscal,dz00);
1109
1110             /* Update vectorial force */
1111             fix0             = _mm_add_ps(fix0,tx);
1112             fiy0             = _mm_add_ps(fiy0,ty);
1113             fiz0             = _mm_add_ps(fiz0,tz);
1114
1115             fjx0             = _mm_add_ps(fjx0,tx);
1116             fjy0             = _mm_add_ps(fjy0,ty);
1117             fjz0             = _mm_add_ps(fjz0,tz);
1118             
1119             }
1120
1121             /**************************
1122              * CALCULATE INTERACTIONS *
1123              **************************/
1124
1125             if (gmx_mm_any_lt(rsq11,rcutoff2))
1126             {
1127
1128             r11              = _mm_mul_ps(rsq11,rinv11);
1129             r11              = _mm_andnot_ps(dummy_mask,r11);
1130
1131             /* EWALD ELECTROSTATICS */
1132
1133             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1134             ewrt             = _mm_mul_ps(r11,ewtabscale);
1135             ewitab           = _mm_cvttps_epi32(ewrt);
1136             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1137             ewitab           = _mm_slli_epi32(ewitab,2);
1138             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1139             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1140             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1141             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1142             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1143             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1144             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1145             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1146             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1147
1148             d                = _mm_sub_ps(r11,rswitch);
1149             d                = _mm_max_ps(d,_mm_setzero_ps());
1150             d2               = _mm_mul_ps(d,d);
1151             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1152
1153             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1154
1155             /* Evaluate switch function */
1156             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1157             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1158             velec            = _mm_mul_ps(velec,sw);
1159             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1160
1161             /* Update potential sum for this i atom from the interaction with this j atom. */
1162             velec            = _mm_and_ps(velec,cutoff_mask);
1163             velec            = _mm_andnot_ps(dummy_mask,velec);
1164             velecsum         = _mm_add_ps(velecsum,velec);
1165
1166             fscal            = felec;
1167
1168             fscal            = _mm_and_ps(fscal,cutoff_mask);
1169
1170             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1171
1172             /* Calculate temporary vectorial force */
1173             tx               = _mm_mul_ps(fscal,dx11);
1174             ty               = _mm_mul_ps(fscal,dy11);
1175             tz               = _mm_mul_ps(fscal,dz11);
1176
1177             /* Update vectorial force */
1178             fix1             = _mm_add_ps(fix1,tx);
1179             fiy1             = _mm_add_ps(fiy1,ty);
1180             fiz1             = _mm_add_ps(fiz1,tz);
1181
1182             fjx1             = _mm_add_ps(fjx1,tx);
1183             fjy1             = _mm_add_ps(fjy1,ty);
1184             fjz1             = _mm_add_ps(fjz1,tz);
1185             
1186             }
1187
1188             /**************************
1189              * CALCULATE INTERACTIONS *
1190              **************************/
1191
1192             if (gmx_mm_any_lt(rsq12,rcutoff2))
1193             {
1194
1195             r12              = _mm_mul_ps(rsq12,rinv12);
1196             r12              = _mm_andnot_ps(dummy_mask,r12);
1197
1198             /* EWALD ELECTROSTATICS */
1199
1200             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1201             ewrt             = _mm_mul_ps(r12,ewtabscale);
1202             ewitab           = _mm_cvttps_epi32(ewrt);
1203             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1204             ewitab           = _mm_slli_epi32(ewitab,2);
1205             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1206             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1207             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1208             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1209             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1210             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1211             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1212             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1213             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1214
1215             d                = _mm_sub_ps(r12,rswitch);
1216             d                = _mm_max_ps(d,_mm_setzero_ps());
1217             d2               = _mm_mul_ps(d,d);
1218             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1219
1220             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1221
1222             /* Evaluate switch function */
1223             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1224             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1225             velec            = _mm_mul_ps(velec,sw);
1226             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1227
1228             /* Update potential sum for this i atom from the interaction with this j atom. */
1229             velec            = _mm_and_ps(velec,cutoff_mask);
1230             velec            = _mm_andnot_ps(dummy_mask,velec);
1231             velecsum         = _mm_add_ps(velecsum,velec);
1232
1233             fscal            = felec;
1234
1235             fscal            = _mm_and_ps(fscal,cutoff_mask);
1236
1237             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1238
1239             /* Calculate temporary vectorial force */
1240             tx               = _mm_mul_ps(fscal,dx12);
1241             ty               = _mm_mul_ps(fscal,dy12);
1242             tz               = _mm_mul_ps(fscal,dz12);
1243
1244             /* Update vectorial force */
1245             fix1             = _mm_add_ps(fix1,tx);
1246             fiy1             = _mm_add_ps(fiy1,ty);
1247             fiz1             = _mm_add_ps(fiz1,tz);
1248
1249             fjx2             = _mm_add_ps(fjx2,tx);
1250             fjy2             = _mm_add_ps(fjy2,ty);
1251             fjz2             = _mm_add_ps(fjz2,tz);
1252             
1253             }
1254
1255             /**************************
1256              * CALCULATE INTERACTIONS *
1257              **************************/
1258
1259             if (gmx_mm_any_lt(rsq13,rcutoff2))
1260             {
1261
1262             r13              = _mm_mul_ps(rsq13,rinv13);
1263             r13              = _mm_andnot_ps(dummy_mask,r13);
1264
1265             /* EWALD ELECTROSTATICS */
1266
1267             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1268             ewrt             = _mm_mul_ps(r13,ewtabscale);
1269             ewitab           = _mm_cvttps_epi32(ewrt);
1270             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1271             ewitab           = _mm_slli_epi32(ewitab,2);
1272             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1273             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1274             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1275             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1276             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1277             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1278             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1279             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
1280             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
1281
1282             d                = _mm_sub_ps(r13,rswitch);
1283             d                = _mm_max_ps(d,_mm_setzero_ps());
1284             d2               = _mm_mul_ps(d,d);
1285             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1286
1287             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1288
1289             /* Evaluate switch function */
1290             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1291             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
1292             velec            = _mm_mul_ps(velec,sw);
1293             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
1294
1295             /* Update potential sum for this i atom from the interaction with this j atom. */
1296             velec            = _mm_and_ps(velec,cutoff_mask);
1297             velec            = _mm_andnot_ps(dummy_mask,velec);
1298             velecsum         = _mm_add_ps(velecsum,velec);
1299
1300             fscal            = felec;
1301
1302             fscal            = _mm_and_ps(fscal,cutoff_mask);
1303
1304             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1305
1306             /* Calculate temporary vectorial force */
1307             tx               = _mm_mul_ps(fscal,dx13);
1308             ty               = _mm_mul_ps(fscal,dy13);
1309             tz               = _mm_mul_ps(fscal,dz13);
1310
1311             /* Update vectorial force */
1312             fix1             = _mm_add_ps(fix1,tx);
1313             fiy1             = _mm_add_ps(fiy1,ty);
1314             fiz1             = _mm_add_ps(fiz1,tz);
1315
1316             fjx3             = _mm_add_ps(fjx3,tx);
1317             fjy3             = _mm_add_ps(fjy3,ty);
1318             fjz3             = _mm_add_ps(fjz3,tz);
1319             
1320             }
1321
1322             /**************************
1323              * CALCULATE INTERACTIONS *
1324              **************************/
1325
1326             if (gmx_mm_any_lt(rsq21,rcutoff2))
1327             {
1328
1329             r21              = _mm_mul_ps(rsq21,rinv21);
1330             r21              = _mm_andnot_ps(dummy_mask,r21);
1331
1332             /* EWALD ELECTROSTATICS */
1333
1334             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1335             ewrt             = _mm_mul_ps(r21,ewtabscale);
1336             ewitab           = _mm_cvttps_epi32(ewrt);
1337             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1338             ewitab           = _mm_slli_epi32(ewitab,2);
1339             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1340             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1341             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1342             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1343             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1344             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1345             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1346             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1347             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1348
1349             d                = _mm_sub_ps(r21,rswitch);
1350             d                = _mm_max_ps(d,_mm_setzero_ps());
1351             d2               = _mm_mul_ps(d,d);
1352             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1353
1354             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1355
1356             /* Evaluate switch function */
1357             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1358             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1359             velec            = _mm_mul_ps(velec,sw);
1360             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1361
1362             /* Update potential sum for this i atom from the interaction with this j atom. */
1363             velec            = _mm_and_ps(velec,cutoff_mask);
1364             velec            = _mm_andnot_ps(dummy_mask,velec);
1365             velecsum         = _mm_add_ps(velecsum,velec);
1366
1367             fscal            = felec;
1368
1369             fscal            = _mm_and_ps(fscal,cutoff_mask);
1370
1371             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1372
1373             /* Calculate temporary vectorial force */
1374             tx               = _mm_mul_ps(fscal,dx21);
1375             ty               = _mm_mul_ps(fscal,dy21);
1376             tz               = _mm_mul_ps(fscal,dz21);
1377
1378             /* Update vectorial force */
1379             fix2             = _mm_add_ps(fix2,tx);
1380             fiy2             = _mm_add_ps(fiy2,ty);
1381             fiz2             = _mm_add_ps(fiz2,tz);
1382
1383             fjx1             = _mm_add_ps(fjx1,tx);
1384             fjy1             = _mm_add_ps(fjy1,ty);
1385             fjz1             = _mm_add_ps(fjz1,tz);
1386             
1387             }
1388
1389             /**************************
1390              * CALCULATE INTERACTIONS *
1391              **************************/
1392
1393             if (gmx_mm_any_lt(rsq22,rcutoff2))
1394             {
1395
1396             r22              = _mm_mul_ps(rsq22,rinv22);
1397             r22              = _mm_andnot_ps(dummy_mask,r22);
1398
1399             /* EWALD ELECTROSTATICS */
1400
1401             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1402             ewrt             = _mm_mul_ps(r22,ewtabscale);
1403             ewitab           = _mm_cvttps_epi32(ewrt);
1404             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1405             ewitab           = _mm_slli_epi32(ewitab,2);
1406             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1407             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1408             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1409             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1410             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1411             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1412             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1413             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1414             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1415
1416             d                = _mm_sub_ps(r22,rswitch);
1417             d                = _mm_max_ps(d,_mm_setzero_ps());
1418             d2               = _mm_mul_ps(d,d);
1419             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1420
1421             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1422
1423             /* Evaluate switch function */
1424             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1425             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1426             velec            = _mm_mul_ps(velec,sw);
1427             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1428
1429             /* Update potential sum for this i atom from the interaction with this j atom. */
1430             velec            = _mm_and_ps(velec,cutoff_mask);
1431             velec            = _mm_andnot_ps(dummy_mask,velec);
1432             velecsum         = _mm_add_ps(velecsum,velec);
1433
1434             fscal            = felec;
1435
1436             fscal            = _mm_and_ps(fscal,cutoff_mask);
1437
1438             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1439
1440             /* Calculate temporary vectorial force */
1441             tx               = _mm_mul_ps(fscal,dx22);
1442             ty               = _mm_mul_ps(fscal,dy22);
1443             tz               = _mm_mul_ps(fscal,dz22);
1444
1445             /* Update vectorial force */
1446             fix2             = _mm_add_ps(fix2,tx);
1447             fiy2             = _mm_add_ps(fiy2,ty);
1448             fiz2             = _mm_add_ps(fiz2,tz);
1449
1450             fjx2             = _mm_add_ps(fjx2,tx);
1451             fjy2             = _mm_add_ps(fjy2,ty);
1452             fjz2             = _mm_add_ps(fjz2,tz);
1453             
1454             }
1455
1456             /**************************
1457              * CALCULATE INTERACTIONS *
1458              **************************/
1459
1460             if (gmx_mm_any_lt(rsq23,rcutoff2))
1461             {
1462
1463             r23              = _mm_mul_ps(rsq23,rinv23);
1464             r23              = _mm_andnot_ps(dummy_mask,r23);
1465
1466             /* EWALD ELECTROSTATICS */
1467
1468             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1469             ewrt             = _mm_mul_ps(r23,ewtabscale);
1470             ewitab           = _mm_cvttps_epi32(ewrt);
1471             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1472             ewitab           = _mm_slli_epi32(ewitab,2);
1473             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1474             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1475             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1476             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1477             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1478             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1479             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1480             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
1481             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
1482
1483             d                = _mm_sub_ps(r23,rswitch);
1484             d                = _mm_max_ps(d,_mm_setzero_ps());
1485             d2               = _mm_mul_ps(d,d);
1486             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1487
1488             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1489
1490             /* Evaluate switch function */
1491             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1492             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
1493             velec            = _mm_mul_ps(velec,sw);
1494             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
1495
1496             /* Update potential sum for this i atom from the interaction with this j atom. */
1497             velec            = _mm_and_ps(velec,cutoff_mask);
1498             velec            = _mm_andnot_ps(dummy_mask,velec);
1499             velecsum         = _mm_add_ps(velecsum,velec);
1500
1501             fscal            = felec;
1502
1503             fscal            = _mm_and_ps(fscal,cutoff_mask);
1504
1505             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1506
1507             /* Calculate temporary vectorial force */
1508             tx               = _mm_mul_ps(fscal,dx23);
1509             ty               = _mm_mul_ps(fscal,dy23);
1510             tz               = _mm_mul_ps(fscal,dz23);
1511
1512             /* Update vectorial force */
1513             fix2             = _mm_add_ps(fix2,tx);
1514             fiy2             = _mm_add_ps(fiy2,ty);
1515             fiz2             = _mm_add_ps(fiz2,tz);
1516
1517             fjx3             = _mm_add_ps(fjx3,tx);
1518             fjy3             = _mm_add_ps(fjy3,ty);
1519             fjz3             = _mm_add_ps(fjz3,tz);
1520             
1521             }
1522
1523             /**************************
1524              * CALCULATE INTERACTIONS *
1525              **************************/
1526
1527             if (gmx_mm_any_lt(rsq31,rcutoff2))
1528             {
1529
1530             r31              = _mm_mul_ps(rsq31,rinv31);
1531             r31              = _mm_andnot_ps(dummy_mask,r31);
1532
1533             /* EWALD ELECTROSTATICS */
1534
1535             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1536             ewrt             = _mm_mul_ps(r31,ewtabscale);
1537             ewitab           = _mm_cvttps_epi32(ewrt);
1538             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1539             ewitab           = _mm_slli_epi32(ewitab,2);
1540             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1541             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1542             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1543             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1544             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1545             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1546             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1547             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
1548             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
1549
1550             d                = _mm_sub_ps(r31,rswitch);
1551             d                = _mm_max_ps(d,_mm_setzero_ps());
1552             d2               = _mm_mul_ps(d,d);
1553             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1554
1555             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1556
1557             /* Evaluate switch function */
1558             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1559             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
1560             velec            = _mm_mul_ps(velec,sw);
1561             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
1562
1563             /* Update potential sum for this i atom from the interaction with this j atom. */
1564             velec            = _mm_and_ps(velec,cutoff_mask);
1565             velec            = _mm_andnot_ps(dummy_mask,velec);
1566             velecsum         = _mm_add_ps(velecsum,velec);
1567
1568             fscal            = felec;
1569
1570             fscal            = _mm_and_ps(fscal,cutoff_mask);
1571
1572             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1573
1574             /* Calculate temporary vectorial force */
1575             tx               = _mm_mul_ps(fscal,dx31);
1576             ty               = _mm_mul_ps(fscal,dy31);
1577             tz               = _mm_mul_ps(fscal,dz31);
1578
1579             /* Update vectorial force */
1580             fix3             = _mm_add_ps(fix3,tx);
1581             fiy3             = _mm_add_ps(fiy3,ty);
1582             fiz3             = _mm_add_ps(fiz3,tz);
1583
1584             fjx1             = _mm_add_ps(fjx1,tx);
1585             fjy1             = _mm_add_ps(fjy1,ty);
1586             fjz1             = _mm_add_ps(fjz1,tz);
1587             
1588             }
1589
1590             /**************************
1591              * CALCULATE INTERACTIONS *
1592              **************************/
1593
1594             if (gmx_mm_any_lt(rsq32,rcutoff2))
1595             {
1596
1597             r32              = _mm_mul_ps(rsq32,rinv32);
1598             r32              = _mm_andnot_ps(dummy_mask,r32);
1599
1600             /* EWALD ELECTROSTATICS */
1601
1602             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1603             ewrt             = _mm_mul_ps(r32,ewtabscale);
1604             ewitab           = _mm_cvttps_epi32(ewrt);
1605             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1606             ewitab           = _mm_slli_epi32(ewitab,2);
1607             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1608             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1609             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1610             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1611             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1612             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1613             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1614             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
1615             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
1616
1617             d                = _mm_sub_ps(r32,rswitch);
1618             d                = _mm_max_ps(d,_mm_setzero_ps());
1619             d2               = _mm_mul_ps(d,d);
1620             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1621
1622             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1623
1624             /* Evaluate switch function */
1625             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1626             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
1627             velec            = _mm_mul_ps(velec,sw);
1628             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
1629
1630             /* Update potential sum for this i atom from the interaction with this j atom. */
1631             velec            = _mm_and_ps(velec,cutoff_mask);
1632             velec            = _mm_andnot_ps(dummy_mask,velec);
1633             velecsum         = _mm_add_ps(velecsum,velec);
1634
1635             fscal            = felec;
1636
1637             fscal            = _mm_and_ps(fscal,cutoff_mask);
1638
1639             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1640
1641             /* Calculate temporary vectorial force */
1642             tx               = _mm_mul_ps(fscal,dx32);
1643             ty               = _mm_mul_ps(fscal,dy32);
1644             tz               = _mm_mul_ps(fscal,dz32);
1645
1646             /* Update vectorial force */
1647             fix3             = _mm_add_ps(fix3,tx);
1648             fiy3             = _mm_add_ps(fiy3,ty);
1649             fiz3             = _mm_add_ps(fiz3,tz);
1650
1651             fjx2             = _mm_add_ps(fjx2,tx);
1652             fjy2             = _mm_add_ps(fjy2,ty);
1653             fjz2             = _mm_add_ps(fjz2,tz);
1654             
1655             }
1656
1657             /**************************
1658              * CALCULATE INTERACTIONS *
1659              **************************/
1660
1661             if (gmx_mm_any_lt(rsq33,rcutoff2))
1662             {
1663
1664             r33              = _mm_mul_ps(rsq33,rinv33);
1665             r33              = _mm_andnot_ps(dummy_mask,r33);
1666
1667             /* EWALD ELECTROSTATICS */
1668
1669             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1670             ewrt             = _mm_mul_ps(r33,ewtabscale);
1671             ewitab           = _mm_cvttps_epi32(ewrt);
1672             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1673             ewitab           = _mm_slli_epi32(ewitab,2);
1674             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1675             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1676             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1677             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1678             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1679             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1680             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1681             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
1682             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
1683
1684             d                = _mm_sub_ps(r33,rswitch);
1685             d                = _mm_max_ps(d,_mm_setzero_ps());
1686             d2               = _mm_mul_ps(d,d);
1687             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1688
1689             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1690
1691             /* Evaluate switch function */
1692             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1693             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
1694             velec            = _mm_mul_ps(velec,sw);
1695             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
1696
1697             /* Update potential sum for this i atom from the interaction with this j atom. */
1698             velec            = _mm_and_ps(velec,cutoff_mask);
1699             velec            = _mm_andnot_ps(dummy_mask,velec);
1700             velecsum         = _mm_add_ps(velecsum,velec);
1701
1702             fscal            = felec;
1703
1704             fscal            = _mm_and_ps(fscal,cutoff_mask);
1705
1706             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1707
1708             /* Calculate temporary vectorial force */
1709             tx               = _mm_mul_ps(fscal,dx33);
1710             ty               = _mm_mul_ps(fscal,dy33);
1711             tz               = _mm_mul_ps(fscal,dz33);
1712
1713             /* Update vectorial force */
1714             fix3             = _mm_add_ps(fix3,tx);
1715             fiy3             = _mm_add_ps(fiy3,ty);
1716             fiz3             = _mm_add_ps(fiz3,tz);
1717
1718             fjx3             = _mm_add_ps(fjx3,tx);
1719             fjy3             = _mm_add_ps(fjy3,ty);
1720             fjz3             = _mm_add_ps(fjz3,tz);
1721             
1722             }
1723
1724             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1725             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1726             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1727             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1728
1729             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1730                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1731                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1732
1733             /* Inner loop uses 657 flops */
1734         }
1735
1736         /* End of innermost loop */
1737
1738         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1739                                               f+i_coord_offset,fshift+i_shift_offset);
1740
1741         ggid                        = gid[iidx];
1742         /* Update potential energies */
1743         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1744         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1745
1746         /* Increment number of inner iterations */
1747         inneriter                  += j_index_end - j_index_start;
1748
1749         /* Outer loop uses 26 flops */
1750     }
1751
1752     /* Increment number of outer iterations */
1753     outeriter        += nri;
1754
1755     /* Update outer/inner flops */
1756
1757     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*657);
1758 }
1759 /*
1760  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1761  * Electrostatics interaction: Ewald
1762  * VdW interaction:            LennardJones
1763  * Geometry:                   Water4-Water4
1764  * Calculate force/pot:        Force
1765  */
1766 void
1767 nb_kernel_ElecEwSw_VdwLJSw_GeomW4W4_F_sse2_single
1768                     (t_nblist * gmx_restrict                nlist,
1769                      rvec * gmx_restrict                    xx,
1770                      rvec * gmx_restrict                    ff,
1771                      t_forcerec * gmx_restrict              fr,
1772                      t_mdatoms * gmx_restrict               mdatoms,
1773                      nb_kernel_data_t * gmx_restrict        kernel_data,
1774                      t_nrnb * gmx_restrict                  nrnb)
1775 {
1776     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1777      * just 0 for non-waters.
1778      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1779      * jnr indices corresponding to data put in the four positions in the SIMD register.
1780      */
1781     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1782     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1783     int              jnrA,jnrB,jnrC,jnrD;
1784     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1785     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1786     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1787     real             rcutoff_scalar;
1788     real             *shiftvec,*fshift,*x,*f;
1789     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1790     real             scratch[4*DIM];
1791     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1792     int              vdwioffset0;
1793     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1794     int              vdwioffset1;
1795     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1796     int              vdwioffset2;
1797     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1798     int              vdwioffset3;
1799     __m128           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1800     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1801     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1802     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1803     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1804     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1805     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1806     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1807     __m128           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1808     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1809     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1810     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1811     __m128           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1812     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1813     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1814     __m128           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1815     __m128           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1816     __m128           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1817     __m128           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1818     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1819     real             *charge;
1820     int              nvdwtype;
1821     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1822     int              *vdwtype;
1823     real             *vdwparam;
1824     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1825     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1826     __m128i          ewitab;
1827     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1828     real             *ewtab;
1829     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1830     real             rswitch_scalar,d_scalar;
1831     __m128           dummy_mask,cutoff_mask;
1832     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1833     __m128           one     = _mm_set1_ps(1.0);
1834     __m128           two     = _mm_set1_ps(2.0);
1835     x                = xx[0];
1836     f                = ff[0];
1837
1838     nri              = nlist->nri;
1839     iinr             = nlist->iinr;
1840     jindex           = nlist->jindex;
1841     jjnr             = nlist->jjnr;
1842     shiftidx         = nlist->shift;
1843     gid              = nlist->gid;
1844     shiftvec         = fr->shift_vec[0];
1845     fshift           = fr->fshift[0];
1846     facel            = _mm_set1_ps(fr->epsfac);
1847     charge           = mdatoms->chargeA;
1848     nvdwtype         = fr->ntype;
1849     vdwparam         = fr->nbfp;
1850     vdwtype          = mdatoms->typeA;
1851
1852     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1853     ewtab            = fr->ic->tabq_coul_FDV0;
1854     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1855     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1856
1857     /* Setup water-specific parameters */
1858     inr              = nlist->iinr[0];
1859     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1860     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1861     iq3              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+3]));
1862     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1863
1864     jq1              = _mm_set1_ps(charge[inr+1]);
1865     jq2              = _mm_set1_ps(charge[inr+2]);
1866     jq3              = _mm_set1_ps(charge[inr+3]);
1867     vdwjidx0A        = 2*vdwtype[inr+0];
1868     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1869     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1870     qq11             = _mm_mul_ps(iq1,jq1);
1871     qq12             = _mm_mul_ps(iq1,jq2);
1872     qq13             = _mm_mul_ps(iq1,jq3);
1873     qq21             = _mm_mul_ps(iq2,jq1);
1874     qq22             = _mm_mul_ps(iq2,jq2);
1875     qq23             = _mm_mul_ps(iq2,jq3);
1876     qq31             = _mm_mul_ps(iq3,jq1);
1877     qq32             = _mm_mul_ps(iq3,jq2);
1878     qq33             = _mm_mul_ps(iq3,jq3);
1879
1880     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1881     rcutoff_scalar   = fr->rcoulomb;
1882     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1883     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1884
1885     rswitch_scalar   = fr->rcoulomb_switch;
1886     rswitch          = _mm_set1_ps(rswitch_scalar);
1887     /* Setup switch parameters */
1888     d_scalar         = rcutoff_scalar-rswitch_scalar;
1889     d                = _mm_set1_ps(d_scalar);
1890     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1891     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1892     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1893     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1894     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1895     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1896
1897     /* Avoid stupid compiler warnings */
1898     jnrA = jnrB = jnrC = jnrD = 0;
1899     j_coord_offsetA = 0;
1900     j_coord_offsetB = 0;
1901     j_coord_offsetC = 0;
1902     j_coord_offsetD = 0;
1903
1904     outeriter        = 0;
1905     inneriter        = 0;
1906
1907     for(iidx=0;iidx<4*DIM;iidx++)
1908     {
1909         scratch[iidx] = 0.0;
1910     }  
1911
1912     /* Start outer loop over neighborlists */
1913     for(iidx=0; iidx<nri; iidx++)
1914     {
1915         /* Load shift vector for this list */
1916         i_shift_offset   = DIM*shiftidx[iidx];
1917
1918         /* Load limits for loop over neighbors */
1919         j_index_start    = jindex[iidx];
1920         j_index_end      = jindex[iidx+1];
1921
1922         /* Get outer coordinate index */
1923         inr              = iinr[iidx];
1924         i_coord_offset   = DIM*inr;
1925
1926         /* Load i particle coords and add shift vector */
1927         gmx_mm_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1928                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1929         
1930         fix0             = _mm_setzero_ps();
1931         fiy0             = _mm_setzero_ps();
1932         fiz0             = _mm_setzero_ps();
1933         fix1             = _mm_setzero_ps();
1934         fiy1             = _mm_setzero_ps();
1935         fiz1             = _mm_setzero_ps();
1936         fix2             = _mm_setzero_ps();
1937         fiy2             = _mm_setzero_ps();
1938         fiz2             = _mm_setzero_ps();
1939         fix3             = _mm_setzero_ps();
1940         fiy3             = _mm_setzero_ps();
1941         fiz3             = _mm_setzero_ps();
1942
1943         /* Start inner kernel loop */
1944         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1945         {
1946
1947             /* Get j neighbor index, and coordinate index */
1948             jnrA             = jjnr[jidx];
1949             jnrB             = jjnr[jidx+1];
1950             jnrC             = jjnr[jidx+2];
1951             jnrD             = jjnr[jidx+3];
1952             j_coord_offsetA  = DIM*jnrA;
1953             j_coord_offsetB  = DIM*jnrB;
1954             j_coord_offsetC  = DIM*jnrC;
1955             j_coord_offsetD  = DIM*jnrD;
1956
1957             /* load j atom coordinates */
1958             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1959                                               x+j_coord_offsetC,x+j_coord_offsetD,
1960                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1961                                               &jy2,&jz2,&jx3,&jy3,&jz3);
1962
1963             /* Calculate displacement vector */
1964             dx00             = _mm_sub_ps(ix0,jx0);
1965             dy00             = _mm_sub_ps(iy0,jy0);
1966             dz00             = _mm_sub_ps(iz0,jz0);
1967             dx11             = _mm_sub_ps(ix1,jx1);
1968             dy11             = _mm_sub_ps(iy1,jy1);
1969             dz11             = _mm_sub_ps(iz1,jz1);
1970             dx12             = _mm_sub_ps(ix1,jx2);
1971             dy12             = _mm_sub_ps(iy1,jy2);
1972             dz12             = _mm_sub_ps(iz1,jz2);
1973             dx13             = _mm_sub_ps(ix1,jx3);
1974             dy13             = _mm_sub_ps(iy1,jy3);
1975             dz13             = _mm_sub_ps(iz1,jz3);
1976             dx21             = _mm_sub_ps(ix2,jx1);
1977             dy21             = _mm_sub_ps(iy2,jy1);
1978             dz21             = _mm_sub_ps(iz2,jz1);
1979             dx22             = _mm_sub_ps(ix2,jx2);
1980             dy22             = _mm_sub_ps(iy2,jy2);
1981             dz22             = _mm_sub_ps(iz2,jz2);
1982             dx23             = _mm_sub_ps(ix2,jx3);
1983             dy23             = _mm_sub_ps(iy2,jy3);
1984             dz23             = _mm_sub_ps(iz2,jz3);
1985             dx31             = _mm_sub_ps(ix3,jx1);
1986             dy31             = _mm_sub_ps(iy3,jy1);
1987             dz31             = _mm_sub_ps(iz3,jz1);
1988             dx32             = _mm_sub_ps(ix3,jx2);
1989             dy32             = _mm_sub_ps(iy3,jy2);
1990             dz32             = _mm_sub_ps(iz3,jz2);
1991             dx33             = _mm_sub_ps(ix3,jx3);
1992             dy33             = _mm_sub_ps(iy3,jy3);
1993             dz33             = _mm_sub_ps(iz3,jz3);
1994
1995             /* Calculate squared distance and things based on it */
1996             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1997             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1998             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1999             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2000             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2001             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2002             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2003             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2004             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2005             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2006
2007             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2008             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2009             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2010             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2011             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2012             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2013             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2014             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2015             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2016             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2017
2018             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2019             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2020             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2021             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2022             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2023             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2024             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2025             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2026             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2027             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2028
2029             fjx0             = _mm_setzero_ps();
2030             fjy0             = _mm_setzero_ps();
2031             fjz0             = _mm_setzero_ps();
2032             fjx1             = _mm_setzero_ps();
2033             fjy1             = _mm_setzero_ps();
2034             fjz1             = _mm_setzero_ps();
2035             fjx2             = _mm_setzero_ps();
2036             fjy2             = _mm_setzero_ps();
2037             fjz2             = _mm_setzero_ps();
2038             fjx3             = _mm_setzero_ps();
2039             fjy3             = _mm_setzero_ps();
2040             fjz3             = _mm_setzero_ps();
2041
2042             /**************************
2043              * CALCULATE INTERACTIONS *
2044              **************************/
2045
2046             if (gmx_mm_any_lt(rsq00,rcutoff2))
2047             {
2048
2049             r00              = _mm_mul_ps(rsq00,rinv00);
2050
2051             /* LENNARD-JONES DISPERSION/REPULSION */
2052
2053             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2054             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2055             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2056             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2057             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2058
2059             d                = _mm_sub_ps(r00,rswitch);
2060             d                = _mm_max_ps(d,_mm_setzero_ps());
2061             d2               = _mm_mul_ps(d,d);
2062             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2063
2064             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2065
2066             /* Evaluate switch function */
2067             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2068             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2069             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2070
2071             fscal            = fvdw;
2072
2073             fscal            = _mm_and_ps(fscal,cutoff_mask);
2074
2075             /* Calculate temporary vectorial force */
2076             tx               = _mm_mul_ps(fscal,dx00);
2077             ty               = _mm_mul_ps(fscal,dy00);
2078             tz               = _mm_mul_ps(fscal,dz00);
2079
2080             /* Update vectorial force */
2081             fix0             = _mm_add_ps(fix0,tx);
2082             fiy0             = _mm_add_ps(fiy0,ty);
2083             fiz0             = _mm_add_ps(fiz0,tz);
2084
2085             fjx0             = _mm_add_ps(fjx0,tx);
2086             fjy0             = _mm_add_ps(fjy0,ty);
2087             fjz0             = _mm_add_ps(fjz0,tz);
2088             
2089             }
2090
2091             /**************************
2092              * CALCULATE INTERACTIONS *
2093              **************************/
2094
2095             if (gmx_mm_any_lt(rsq11,rcutoff2))
2096             {
2097
2098             r11              = _mm_mul_ps(rsq11,rinv11);
2099
2100             /* EWALD ELECTROSTATICS */
2101
2102             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2103             ewrt             = _mm_mul_ps(r11,ewtabscale);
2104             ewitab           = _mm_cvttps_epi32(ewrt);
2105             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2106             ewitab           = _mm_slli_epi32(ewitab,2);
2107             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2108             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2109             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2110             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2111             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2112             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2113             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2114             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2115             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2116
2117             d                = _mm_sub_ps(r11,rswitch);
2118             d                = _mm_max_ps(d,_mm_setzero_ps());
2119             d2               = _mm_mul_ps(d,d);
2120             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2121
2122             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2123
2124             /* Evaluate switch function */
2125             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2126             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2127             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2128
2129             fscal            = felec;
2130
2131             fscal            = _mm_and_ps(fscal,cutoff_mask);
2132
2133             /* Calculate temporary vectorial force */
2134             tx               = _mm_mul_ps(fscal,dx11);
2135             ty               = _mm_mul_ps(fscal,dy11);
2136             tz               = _mm_mul_ps(fscal,dz11);
2137
2138             /* Update vectorial force */
2139             fix1             = _mm_add_ps(fix1,tx);
2140             fiy1             = _mm_add_ps(fiy1,ty);
2141             fiz1             = _mm_add_ps(fiz1,tz);
2142
2143             fjx1             = _mm_add_ps(fjx1,tx);
2144             fjy1             = _mm_add_ps(fjy1,ty);
2145             fjz1             = _mm_add_ps(fjz1,tz);
2146             
2147             }
2148
2149             /**************************
2150              * CALCULATE INTERACTIONS *
2151              **************************/
2152
2153             if (gmx_mm_any_lt(rsq12,rcutoff2))
2154             {
2155
2156             r12              = _mm_mul_ps(rsq12,rinv12);
2157
2158             /* EWALD ELECTROSTATICS */
2159
2160             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2161             ewrt             = _mm_mul_ps(r12,ewtabscale);
2162             ewitab           = _mm_cvttps_epi32(ewrt);
2163             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2164             ewitab           = _mm_slli_epi32(ewitab,2);
2165             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2166             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2167             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2168             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2169             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2170             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2171             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2172             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2173             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2174
2175             d                = _mm_sub_ps(r12,rswitch);
2176             d                = _mm_max_ps(d,_mm_setzero_ps());
2177             d2               = _mm_mul_ps(d,d);
2178             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2179
2180             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2181
2182             /* Evaluate switch function */
2183             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2184             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2185             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2186
2187             fscal            = felec;
2188
2189             fscal            = _mm_and_ps(fscal,cutoff_mask);
2190
2191             /* Calculate temporary vectorial force */
2192             tx               = _mm_mul_ps(fscal,dx12);
2193             ty               = _mm_mul_ps(fscal,dy12);
2194             tz               = _mm_mul_ps(fscal,dz12);
2195
2196             /* Update vectorial force */
2197             fix1             = _mm_add_ps(fix1,tx);
2198             fiy1             = _mm_add_ps(fiy1,ty);
2199             fiz1             = _mm_add_ps(fiz1,tz);
2200
2201             fjx2             = _mm_add_ps(fjx2,tx);
2202             fjy2             = _mm_add_ps(fjy2,ty);
2203             fjz2             = _mm_add_ps(fjz2,tz);
2204             
2205             }
2206
2207             /**************************
2208              * CALCULATE INTERACTIONS *
2209              **************************/
2210
2211             if (gmx_mm_any_lt(rsq13,rcutoff2))
2212             {
2213
2214             r13              = _mm_mul_ps(rsq13,rinv13);
2215
2216             /* EWALD ELECTROSTATICS */
2217
2218             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2219             ewrt             = _mm_mul_ps(r13,ewtabscale);
2220             ewitab           = _mm_cvttps_epi32(ewrt);
2221             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2222             ewitab           = _mm_slli_epi32(ewitab,2);
2223             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2224             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2225             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2226             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2227             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2228             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2229             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2230             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2231             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2232
2233             d                = _mm_sub_ps(r13,rswitch);
2234             d                = _mm_max_ps(d,_mm_setzero_ps());
2235             d2               = _mm_mul_ps(d,d);
2236             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2237
2238             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2239
2240             /* Evaluate switch function */
2241             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2242             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2243             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2244
2245             fscal            = felec;
2246
2247             fscal            = _mm_and_ps(fscal,cutoff_mask);
2248
2249             /* Calculate temporary vectorial force */
2250             tx               = _mm_mul_ps(fscal,dx13);
2251             ty               = _mm_mul_ps(fscal,dy13);
2252             tz               = _mm_mul_ps(fscal,dz13);
2253
2254             /* Update vectorial force */
2255             fix1             = _mm_add_ps(fix1,tx);
2256             fiy1             = _mm_add_ps(fiy1,ty);
2257             fiz1             = _mm_add_ps(fiz1,tz);
2258
2259             fjx3             = _mm_add_ps(fjx3,tx);
2260             fjy3             = _mm_add_ps(fjy3,ty);
2261             fjz3             = _mm_add_ps(fjz3,tz);
2262             
2263             }
2264
2265             /**************************
2266              * CALCULATE INTERACTIONS *
2267              **************************/
2268
2269             if (gmx_mm_any_lt(rsq21,rcutoff2))
2270             {
2271
2272             r21              = _mm_mul_ps(rsq21,rinv21);
2273
2274             /* EWALD ELECTROSTATICS */
2275
2276             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2277             ewrt             = _mm_mul_ps(r21,ewtabscale);
2278             ewitab           = _mm_cvttps_epi32(ewrt);
2279             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2280             ewitab           = _mm_slli_epi32(ewitab,2);
2281             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2282             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2283             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2284             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2285             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2286             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2287             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2288             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2289             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2290
2291             d                = _mm_sub_ps(r21,rswitch);
2292             d                = _mm_max_ps(d,_mm_setzero_ps());
2293             d2               = _mm_mul_ps(d,d);
2294             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2295
2296             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2297
2298             /* Evaluate switch function */
2299             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2300             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2301             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2302
2303             fscal            = felec;
2304
2305             fscal            = _mm_and_ps(fscal,cutoff_mask);
2306
2307             /* Calculate temporary vectorial force */
2308             tx               = _mm_mul_ps(fscal,dx21);
2309             ty               = _mm_mul_ps(fscal,dy21);
2310             tz               = _mm_mul_ps(fscal,dz21);
2311
2312             /* Update vectorial force */
2313             fix2             = _mm_add_ps(fix2,tx);
2314             fiy2             = _mm_add_ps(fiy2,ty);
2315             fiz2             = _mm_add_ps(fiz2,tz);
2316
2317             fjx1             = _mm_add_ps(fjx1,tx);
2318             fjy1             = _mm_add_ps(fjy1,ty);
2319             fjz1             = _mm_add_ps(fjz1,tz);
2320             
2321             }
2322
2323             /**************************
2324              * CALCULATE INTERACTIONS *
2325              **************************/
2326
2327             if (gmx_mm_any_lt(rsq22,rcutoff2))
2328             {
2329
2330             r22              = _mm_mul_ps(rsq22,rinv22);
2331
2332             /* EWALD ELECTROSTATICS */
2333
2334             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2335             ewrt             = _mm_mul_ps(r22,ewtabscale);
2336             ewitab           = _mm_cvttps_epi32(ewrt);
2337             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2338             ewitab           = _mm_slli_epi32(ewitab,2);
2339             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2340             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2341             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2342             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2343             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2344             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2345             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2346             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2347             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2348
2349             d                = _mm_sub_ps(r22,rswitch);
2350             d                = _mm_max_ps(d,_mm_setzero_ps());
2351             d2               = _mm_mul_ps(d,d);
2352             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2353
2354             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2355
2356             /* Evaluate switch function */
2357             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2358             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2359             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2360
2361             fscal            = felec;
2362
2363             fscal            = _mm_and_ps(fscal,cutoff_mask);
2364
2365             /* Calculate temporary vectorial force */
2366             tx               = _mm_mul_ps(fscal,dx22);
2367             ty               = _mm_mul_ps(fscal,dy22);
2368             tz               = _mm_mul_ps(fscal,dz22);
2369
2370             /* Update vectorial force */
2371             fix2             = _mm_add_ps(fix2,tx);
2372             fiy2             = _mm_add_ps(fiy2,ty);
2373             fiz2             = _mm_add_ps(fiz2,tz);
2374
2375             fjx2             = _mm_add_ps(fjx2,tx);
2376             fjy2             = _mm_add_ps(fjy2,ty);
2377             fjz2             = _mm_add_ps(fjz2,tz);
2378             
2379             }
2380
2381             /**************************
2382              * CALCULATE INTERACTIONS *
2383              **************************/
2384
2385             if (gmx_mm_any_lt(rsq23,rcutoff2))
2386             {
2387
2388             r23              = _mm_mul_ps(rsq23,rinv23);
2389
2390             /* EWALD ELECTROSTATICS */
2391
2392             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2393             ewrt             = _mm_mul_ps(r23,ewtabscale);
2394             ewitab           = _mm_cvttps_epi32(ewrt);
2395             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2396             ewitab           = _mm_slli_epi32(ewitab,2);
2397             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2398             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2399             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2400             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2401             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2402             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2403             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2404             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
2405             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
2406
2407             d                = _mm_sub_ps(r23,rswitch);
2408             d                = _mm_max_ps(d,_mm_setzero_ps());
2409             d2               = _mm_mul_ps(d,d);
2410             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2411
2412             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2413
2414             /* Evaluate switch function */
2415             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2416             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
2417             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
2418
2419             fscal            = felec;
2420
2421             fscal            = _mm_and_ps(fscal,cutoff_mask);
2422
2423             /* Calculate temporary vectorial force */
2424             tx               = _mm_mul_ps(fscal,dx23);
2425             ty               = _mm_mul_ps(fscal,dy23);
2426             tz               = _mm_mul_ps(fscal,dz23);
2427
2428             /* Update vectorial force */
2429             fix2             = _mm_add_ps(fix2,tx);
2430             fiy2             = _mm_add_ps(fiy2,ty);
2431             fiz2             = _mm_add_ps(fiz2,tz);
2432
2433             fjx3             = _mm_add_ps(fjx3,tx);
2434             fjy3             = _mm_add_ps(fjy3,ty);
2435             fjz3             = _mm_add_ps(fjz3,tz);
2436             
2437             }
2438
2439             /**************************
2440              * CALCULATE INTERACTIONS *
2441              **************************/
2442
2443             if (gmx_mm_any_lt(rsq31,rcutoff2))
2444             {
2445
2446             r31              = _mm_mul_ps(rsq31,rinv31);
2447
2448             /* EWALD ELECTROSTATICS */
2449
2450             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2451             ewrt             = _mm_mul_ps(r31,ewtabscale);
2452             ewitab           = _mm_cvttps_epi32(ewrt);
2453             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2454             ewitab           = _mm_slli_epi32(ewitab,2);
2455             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2456             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2457             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2458             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2459             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2460             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2461             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2462             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
2463             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
2464
2465             d                = _mm_sub_ps(r31,rswitch);
2466             d                = _mm_max_ps(d,_mm_setzero_ps());
2467             d2               = _mm_mul_ps(d,d);
2468             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2469
2470             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2471
2472             /* Evaluate switch function */
2473             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2474             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
2475             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
2476
2477             fscal            = felec;
2478
2479             fscal            = _mm_and_ps(fscal,cutoff_mask);
2480
2481             /* Calculate temporary vectorial force */
2482             tx               = _mm_mul_ps(fscal,dx31);
2483             ty               = _mm_mul_ps(fscal,dy31);
2484             tz               = _mm_mul_ps(fscal,dz31);
2485
2486             /* Update vectorial force */
2487             fix3             = _mm_add_ps(fix3,tx);
2488             fiy3             = _mm_add_ps(fiy3,ty);
2489             fiz3             = _mm_add_ps(fiz3,tz);
2490
2491             fjx1             = _mm_add_ps(fjx1,tx);
2492             fjy1             = _mm_add_ps(fjy1,ty);
2493             fjz1             = _mm_add_ps(fjz1,tz);
2494             
2495             }
2496
2497             /**************************
2498              * CALCULATE INTERACTIONS *
2499              **************************/
2500
2501             if (gmx_mm_any_lt(rsq32,rcutoff2))
2502             {
2503
2504             r32              = _mm_mul_ps(rsq32,rinv32);
2505
2506             /* EWALD ELECTROSTATICS */
2507
2508             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2509             ewrt             = _mm_mul_ps(r32,ewtabscale);
2510             ewitab           = _mm_cvttps_epi32(ewrt);
2511             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2512             ewitab           = _mm_slli_epi32(ewitab,2);
2513             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2514             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2515             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2516             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2517             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2518             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2519             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2520             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
2521             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
2522
2523             d                = _mm_sub_ps(r32,rswitch);
2524             d                = _mm_max_ps(d,_mm_setzero_ps());
2525             d2               = _mm_mul_ps(d,d);
2526             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2527
2528             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2529
2530             /* Evaluate switch function */
2531             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2532             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
2533             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
2534
2535             fscal            = felec;
2536
2537             fscal            = _mm_and_ps(fscal,cutoff_mask);
2538
2539             /* Calculate temporary vectorial force */
2540             tx               = _mm_mul_ps(fscal,dx32);
2541             ty               = _mm_mul_ps(fscal,dy32);
2542             tz               = _mm_mul_ps(fscal,dz32);
2543
2544             /* Update vectorial force */
2545             fix3             = _mm_add_ps(fix3,tx);
2546             fiy3             = _mm_add_ps(fiy3,ty);
2547             fiz3             = _mm_add_ps(fiz3,tz);
2548
2549             fjx2             = _mm_add_ps(fjx2,tx);
2550             fjy2             = _mm_add_ps(fjy2,ty);
2551             fjz2             = _mm_add_ps(fjz2,tz);
2552             
2553             }
2554
2555             /**************************
2556              * CALCULATE INTERACTIONS *
2557              **************************/
2558
2559             if (gmx_mm_any_lt(rsq33,rcutoff2))
2560             {
2561
2562             r33              = _mm_mul_ps(rsq33,rinv33);
2563
2564             /* EWALD ELECTROSTATICS */
2565
2566             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2567             ewrt             = _mm_mul_ps(r33,ewtabscale);
2568             ewitab           = _mm_cvttps_epi32(ewrt);
2569             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2570             ewitab           = _mm_slli_epi32(ewitab,2);
2571             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2572             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2573             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2574             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2575             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2576             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2577             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2578             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
2579             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
2580
2581             d                = _mm_sub_ps(r33,rswitch);
2582             d                = _mm_max_ps(d,_mm_setzero_ps());
2583             d2               = _mm_mul_ps(d,d);
2584             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2585
2586             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2587
2588             /* Evaluate switch function */
2589             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2590             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
2591             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
2592
2593             fscal            = felec;
2594
2595             fscal            = _mm_and_ps(fscal,cutoff_mask);
2596
2597             /* Calculate temporary vectorial force */
2598             tx               = _mm_mul_ps(fscal,dx33);
2599             ty               = _mm_mul_ps(fscal,dy33);
2600             tz               = _mm_mul_ps(fscal,dz33);
2601
2602             /* Update vectorial force */
2603             fix3             = _mm_add_ps(fix3,tx);
2604             fiy3             = _mm_add_ps(fiy3,ty);
2605             fiz3             = _mm_add_ps(fiz3,tz);
2606
2607             fjx3             = _mm_add_ps(fjx3,tx);
2608             fjy3             = _mm_add_ps(fjy3,ty);
2609             fjz3             = _mm_add_ps(fjz3,tz);
2610             
2611             }
2612
2613             fjptrA             = f+j_coord_offsetA;
2614             fjptrB             = f+j_coord_offsetB;
2615             fjptrC             = f+j_coord_offsetC;
2616             fjptrD             = f+j_coord_offsetD;
2617
2618             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2619                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2620                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2621
2622             /* Inner loop uses 617 flops */
2623         }
2624
2625         if(jidx<j_index_end)
2626         {
2627
2628             /* Get j neighbor index, and coordinate index */
2629             jnrlistA         = jjnr[jidx];
2630             jnrlistB         = jjnr[jidx+1];
2631             jnrlistC         = jjnr[jidx+2];
2632             jnrlistD         = jjnr[jidx+3];
2633             /* Sign of each element will be negative for non-real atoms.
2634              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2635              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2636              */
2637             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2638             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2639             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2640             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2641             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2642             j_coord_offsetA  = DIM*jnrA;
2643             j_coord_offsetB  = DIM*jnrB;
2644             j_coord_offsetC  = DIM*jnrC;
2645             j_coord_offsetD  = DIM*jnrD;
2646
2647             /* load j atom coordinates */
2648             gmx_mm_load_4rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2649                                               x+j_coord_offsetC,x+j_coord_offsetD,
2650                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2651                                               &jy2,&jz2,&jx3,&jy3,&jz3);
2652
2653             /* Calculate displacement vector */
2654             dx00             = _mm_sub_ps(ix0,jx0);
2655             dy00             = _mm_sub_ps(iy0,jy0);
2656             dz00             = _mm_sub_ps(iz0,jz0);
2657             dx11             = _mm_sub_ps(ix1,jx1);
2658             dy11             = _mm_sub_ps(iy1,jy1);
2659             dz11             = _mm_sub_ps(iz1,jz1);
2660             dx12             = _mm_sub_ps(ix1,jx2);
2661             dy12             = _mm_sub_ps(iy1,jy2);
2662             dz12             = _mm_sub_ps(iz1,jz2);
2663             dx13             = _mm_sub_ps(ix1,jx3);
2664             dy13             = _mm_sub_ps(iy1,jy3);
2665             dz13             = _mm_sub_ps(iz1,jz3);
2666             dx21             = _mm_sub_ps(ix2,jx1);
2667             dy21             = _mm_sub_ps(iy2,jy1);
2668             dz21             = _mm_sub_ps(iz2,jz1);
2669             dx22             = _mm_sub_ps(ix2,jx2);
2670             dy22             = _mm_sub_ps(iy2,jy2);
2671             dz22             = _mm_sub_ps(iz2,jz2);
2672             dx23             = _mm_sub_ps(ix2,jx3);
2673             dy23             = _mm_sub_ps(iy2,jy3);
2674             dz23             = _mm_sub_ps(iz2,jz3);
2675             dx31             = _mm_sub_ps(ix3,jx1);
2676             dy31             = _mm_sub_ps(iy3,jy1);
2677             dz31             = _mm_sub_ps(iz3,jz1);
2678             dx32             = _mm_sub_ps(ix3,jx2);
2679             dy32             = _mm_sub_ps(iy3,jy2);
2680             dz32             = _mm_sub_ps(iz3,jz2);
2681             dx33             = _mm_sub_ps(ix3,jx3);
2682             dy33             = _mm_sub_ps(iy3,jy3);
2683             dz33             = _mm_sub_ps(iz3,jz3);
2684
2685             /* Calculate squared distance and things based on it */
2686             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2687             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2688             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2689             rsq13            = gmx_mm_calc_rsq_ps(dx13,dy13,dz13);
2690             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2691             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2692             rsq23            = gmx_mm_calc_rsq_ps(dx23,dy23,dz23);
2693             rsq31            = gmx_mm_calc_rsq_ps(dx31,dy31,dz31);
2694             rsq32            = gmx_mm_calc_rsq_ps(dx32,dy32,dz32);
2695             rsq33            = gmx_mm_calc_rsq_ps(dx33,dy33,dz33);
2696
2697             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2698             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2699             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2700             rinv13           = gmx_mm_invsqrt_ps(rsq13);
2701             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2702             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2703             rinv23           = gmx_mm_invsqrt_ps(rsq23);
2704             rinv31           = gmx_mm_invsqrt_ps(rsq31);
2705             rinv32           = gmx_mm_invsqrt_ps(rsq32);
2706             rinv33           = gmx_mm_invsqrt_ps(rsq33);
2707
2708             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2709             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2710             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2711             rinvsq13         = _mm_mul_ps(rinv13,rinv13);
2712             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2713             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2714             rinvsq23         = _mm_mul_ps(rinv23,rinv23);
2715             rinvsq31         = _mm_mul_ps(rinv31,rinv31);
2716             rinvsq32         = _mm_mul_ps(rinv32,rinv32);
2717             rinvsq33         = _mm_mul_ps(rinv33,rinv33);
2718
2719             fjx0             = _mm_setzero_ps();
2720             fjy0             = _mm_setzero_ps();
2721             fjz0             = _mm_setzero_ps();
2722             fjx1             = _mm_setzero_ps();
2723             fjy1             = _mm_setzero_ps();
2724             fjz1             = _mm_setzero_ps();
2725             fjx2             = _mm_setzero_ps();
2726             fjy2             = _mm_setzero_ps();
2727             fjz2             = _mm_setzero_ps();
2728             fjx3             = _mm_setzero_ps();
2729             fjy3             = _mm_setzero_ps();
2730             fjz3             = _mm_setzero_ps();
2731
2732             /**************************
2733              * CALCULATE INTERACTIONS *
2734              **************************/
2735
2736             if (gmx_mm_any_lt(rsq00,rcutoff2))
2737             {
2738
2739             r00              = _mm_mul_ps(rsq00,rinv00);
2740             r00              = _mm_andnot_ps(dummy_mask,r00);
2741
2742             /* LENNARD-JONES DISPERSION/REPULSION */
2743
2744             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2745             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2746             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2747             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2748             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2749
2750             d                = _mm_sub_ps(r00,rswitch);
2751             d                = _mm_max_ps(d,_mm_setzero_ps());
2752             d2               = _mm_mul_ps(d,d);
2753             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2754
2755             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2756
2757             /* Evaluate switch function */
2758             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2759             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2760             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2761
2762             fscal            = fvdw;
2763
2764             fscal            = _mm_and_ps(fscal,cutoff_mask);
2765
2766             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2767
2768             /* Calculate temporary vectorial force */
2769             tx               = _mm_mul_ps(fscal,dx00);
2770             ty               = _mm_mul_ps(fscal,dy00);
2771             tz               = _mm_mul_ps(fscal,dz00);
2772
2773             /* Update vectorial force */
2774             fix0             = _mm_add_ps(fix0,tx);
2775             fiy0             = _mm_add_ps(fiy0,ty);
2776             fiz0             = _mm_add_ps(fiz0,tz);
2777
2778             fjx0             = _mm_add_ps(fjx0,tx);
2779             fjy0             = _mm_add_ps(fjy0,ty);
2780             fjz0             = _mm_add_ps(fjz0,tz);
2781             
2782             }
2783
2784             /**************************
2785              * CALCULATE INTERACTIONS *
2786              **************************/
2787
2788             if (gmx_mm_any_lt(rsq11,rcutoff2))
2789             {
2790
2791             r11              = _mm_mul_ps(rsq11,rinv11);
2792             r11              = _mm_andnot_ps(dummy_mask,r11);
2793
2794             /* EWALD ELECTROSTATICS */
2795
2796             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2797             ewrt             = _mm_mul_ps(r11,ewtabscale);
2798             ewitab           = _mm_cvttps_epi32(ewrt);
2799             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2800             ewitab           = _mm_slli_epi32(ewitab,2);
2801             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2802             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2803             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2804             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2805             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2806             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2807             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2808             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2809             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2810
2811             d                = _mm_sub_ps(r11,rswitch);
2812             d                = _mm_max_ps(d,_mm_setzero_ps());
2813             d2               = _mm_mul_ps(d,d);
2814             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2815
2816             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2817
2818             /* Evaluate switch function */
2819             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2820             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2821             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2822
2823             fscal            = felec;
2824
2825             fscal            = _mm_and_ps(fscal,cutoff_mask);
2826
2827             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2828
2829             /* Calculate temporary vectorial force */
2830             tx               = _mm_mul_ps(fscal,dx11);
2831             ty               = _mm_mul_ps(fscal,dy11);
2832             tz               = _mm_mul_ps(fscal,dz11);
2833
2834             /* Update vectorial force */
2835             fix1             = _mm_add_ps(fix1,tx);
2836             fiy1             = _mm_add_ps(fiy1,ty);
2837             fiz1             = _mm_add_ps(fiz1,tz);
2838
2839             fjx1             = _mm_add_ps(fjx1,tx);
2840             fjy1             = _mm_add_ps(fjy1,ty);
2841             fjz1             = _mm_add_ps(fjz1,tz);
2842             
2843             }
2844
2845             /**************************
2846              * CALCULATE INTERACTIONS *
2847              **************************/
2848
2849             if (gmx_mm_any_lt(rsq12,rcutoff2))
2850             {
2851
2852             r12              = _mm_mul_ps(rsq12,rinv12);
2853             r12              = _mm_andnot_ps(dummy_mask,r12);
2854
2855             /* EWALD ELECTROSTATICS */
2856
2857             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2858             ewrt             = _mm_mul_ps(r12,ewtabscale);
2859             ewitab           = _mm_cvttps_epi32(ewrt);
2860             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2861             ewitab           = _mm_slli_epi32(ewitab,2);
2862             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2863             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2864             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2865             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2866             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2867             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2868             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2869             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2870             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2871
2872             d                = _mm_sub_ps(r12,rswitch);
2873             d                = _mm_max_ps(d,_mm_setzero_ps());
2874             d2               = _mm_mul_ps(d,d);
2875             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2876
2877             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2878
2879             /* Evaluate switch function */
2880             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2881             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2882             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2883
2884             fscal            = felec;
2885
2886             fscal            = _mm_and_ps(fscal,cutoff_mask);
2887
2888             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2889
2890             /* Calculate temporary vectorial force */
2891             tx               = _mm_mul_ps(fscal,dx12);
2892             ty               = _mm_mul_ps(fscal,dy12);
2893             tz               = _mm_mul_ps(fscal,dz12);
2894
2895             /* Update vectorial force */
2896             fix1             = _mm_add_ps(fix1,tx);
2897             fiy1             = _mm_add_ps(fiy1,ty);
2898             fiz1             = _mm_add_ps(fiz1,tz);
2899
2900             fjx2             = _mm_add_ps(fjx2,tx);
2901             fjy2             = _mm_add_ps(fjy2,ty);
2902             fjz2             = _mm_add_ps(fjz2,tz);
2903             
2904             }
2905
2906             /**************************
2907              * CALCULATE INTERACTIONS *
2908              **************************/
2909
2910             if (gmx_mm_any_lt(rsq13,rcutoff2))
2911             {
2912
2913             r13              = _mm_mul_ps(rsq13,rinv13);
2914             r13              = _mm_andnot_ps(dummy_mask,r13);
2915
2916             /* EWALD ELECTROSTATICS */
2917
2918             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2919             ewrt             = _mm_mul_ps(r13,ewtabscale);
2920             ewitab           = _mm_cvttps_epi32(ewrt);
2921             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2922             ewitab           = _mm_slli_epi32(ewitab,2);
2923             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2924             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2925             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2926             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2927             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2928             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2929             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2930             velec            = _mm_mul_ps(qq13,_mm_sub_ps(rinv13,velec));
2931             felec            = _mm_mul_ps(_mm_mul_ps(qq13,rinv13),_mm_sub_ps(rinvsq13,felec));
2932
2933             d                = _mm_sub_ps(r13,rswitch);
2934             d                = _mm_max_ps(d,_mm_setzero_ps());
2935             d2               = _mm_mul_ps(d,d);
2936             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2937
2938             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2939
2940             /* Evaluate switch function */
2941             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2942             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv13,_mm_mul_ps(velec,dsw)) );
2943             cutoff_mask      = _mm_cmplt_ps(rsq13,rcutoff2);
2944
2945             fscal            = felec;
2946
2947             fscal            = _mm_and_ps(fscal,cutoff_mask);
2948
2949             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2950
2951             /* Calculate temporary vectorial force */
2952             tx               = _mm_mul_ps(fscal,dx13);
2953             ty               = _mm_mul_ps(fscal,dy13);
2954             tz               = _mm_mul_ps(fscal,dz13);
2955
2956             /* Update vectorial force */
2957             fix1             = _mm_add_ps(fix1,tx);
2958             fiy1             = _mm_add_ps(fiy1,ty);
2959             fiz1             = _mm_add_ps(fiz1,tz);
2960
2961             fjx3             = _mm_add_ps(fjx3,tx);
2962             fjy3             = _mm_add_ps(fjy3,ty);
2963             fjz3             = _mm_add_ps(fjz3,tz);
2964             
2965             }
2966
2967             /**************************
2968              * CALCULATE INTERACTIONS *
2969              **************************/
2970
2971             if (gmx_mm_any_lt(rsq21,rcutoff2))
2972             {
2973
2974             r21              = _mm_mul_ps(rsq21,rinv21);
2975             r21              = _mm_andnot_ps(dummy_mask,r21);
2976
2977             /* EWALD ELECTROSTATICS */
2978
2979             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2980             ewrt             = _mm_mul_ps(r21,ewtabscale);
2981             ewitab           = _mm_cvttps_epi32(ewrt);
2982             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2983             ewitab           = _mm_slli_epi32(ewitab,2);
2984             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2985             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2986             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2987             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2988             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2989             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2990             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2991             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2992             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2993
2994             d                = _mm_sub_ps(r21,rswitch);
2995             d                = _mm_max_ps(d,_mm_setzero_ps());
2996             d2               = _mm_mul_ps(d,d);
2997             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2998
2999             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3000
3001             /* Evaluate switch function */
3002             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3003             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3004             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
3005
3006             fscal            = felec;
3007
3008             fscal            = _mm_and_ps(fscal,cutoff_mask);
3009
3010             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3011
3012             /* Calculate temporary vectorial force */
3013             tx               = _mm_mul_ps(fscal,dx21);
3014             ty               = _mm_mul_ps(fscal,dy21);
3015             tz               = _mm_mul_ps(fscal,dz21);
3016
3017             /* Update vectorial force */
3018             fix2             = _mm_add_ps(fix2,tx);
3019             fiy2             = _mm_add_ps(fiy2,ty);
3020             fiz2             = _mm_add_ps(fiz2,tz);
3021
3022             fjx1             = _mm_add_ps(fjx1,tx);
3023             fjy1             = _mm_add_ps(fjy1,ty);
3024             fjz1             = _mm_add_ps(fjz1,tz);
3025             
3026             }
3027
3028             /**************************
3029              * CALCULATE INTERACTIONS *
3030              **************************/
3031
3032             if (gmx_mm_any_lt(rsq22,rcutoff2))
3033             {
3034
3035             r22              = _mm_mul_ps(rsq22,rinv22);
3036             r22              = _mm_andnot_ps(dummy_mask,r22);
3037
3038             /* EWALD ELECTROSTATICS */
3039
3040             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3041             ewrt             = _mm_mul_ps(r22,ewtabscale);
3042             ewitab           = _mm_cvttps_epi32(ewrt);
3043             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3044             ewitab           = _mm_slli_epi32(ewitab,2);
3045             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3046             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3047             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3048             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3049             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3050             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3051             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3052             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3053             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3054
3055             d                = _mm_sub_ps(r22,rswitch);
3056             d                = _mm_max_ps(d,_mm_setzero_ps());
3057             d2               = _mm_mul_ps(d,d);
3058             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3059
3060             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3061
3062             /* Evaluate switch function */
3063             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3064             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3065             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
3066
3067             fscal            = felec;
3068
3069             fscal            = _mm_and_ps(fscal,cutoff_mask);
3070
3071             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3072
3073             /* Calculate temporary vectorial force */
3074             tx               = _mm_mul_ps(fscal,dx22);
3075             ty               = _mm_mul_ps(fscal,dy22);
3076             tz               = _mm_mul_ps(fscal,dz22);
3077
3078             /* Update vectorial force */
3079             fix2             = _mm_add_ps(fix2,tx);
3080             fiy2             = _mm_add_ps(fiy2,ty);
3081             fiz2             = _mm_add_ps(fiz2,tz);
3082
3083             fjx2             = _mm_add_ps(fjx2,tx);
3084             fjy2             = _mm_add_ps(fjy2,ty);
3085             fjz2             = _mm_add_ps(fjz2,tz);
3086             
3087             }
3088
3089             /**************************
3090              * CALCULATE INTERACTIONS *
3091              **************************/
3092
3093             if (gmx_mm_any_lt(rsq23,rcutoff2))
3094             {
3095
3096             r23              = _mm_mul_ps(rsq23,rinv23);
3097             r23              = _mm_andnot_ps(dummy_mask,r23);
3098
3099             /* EWALD ELECTROSTATICS */
3100
3101             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3102             ewrt             = _mm_mul_ps(r23,ewtabscale);
3103             ewitab           = _mm_cvttps_epi32(ewrt);
3104             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3105             ewitab           = _mm_slli_epi32(ewitab,2);
3106             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3107             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3108             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3109             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3110             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3111             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3112             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3113             velec            = _mm_mul_ps(qq23,_mm_sub_ps(rinv23,velec));
3114             felec            = _mm_mul_ps(_mm_mul_ps(qq23,rinv23),_mm_sub_ps(rinvsq23,felec));
3115
3116             d                = _mm_sub_ps(r23,rswitch);
3117             d                = _mm_max_ps(d,_mm_setzero_ps());
3118             d2               = _mm_mul_ps(d,d);
3119             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3120
3121             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3122
3123             /* Evaluate switch function */
3124             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3125             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv23,_mm_mul_ps(velec,dsw)) );
3126             cutoff_mask      = _mm_cmplt_ps(rsq23,rcutoff2);
3127
3128             fscal            = felec;
3129
3130             fscal            = _mm_and_ps(fscal,cutoff_mask);
3131
3132             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3133
3134             /* Calculate temporary vectorial force */
3135             tx               = _mm_mul_ps(fscal,dx23);
3136             ty               = _mm_mul_ps(fscal,dy23);
3137             tz               = _mm_mul_ps(fscal,dz23);
3138
3139             /* Update vectorial force */
3140             fix2             = _mm_add_ps(fix2,tx);
3141             fiy2             = _mm_add_ps(fiy2,ty);
3142             fiz2             = _mm_add_ps(fiz2,tz);
3143
3144             fjx3             = _mm_add_ps(fjx3,tx);
3145             fjy3             = _mm_add_ps(fjy3,ty);
3146             fjz3             = _mm_add_ps(fjz3,tz);
3147             
3148             }
3149
3150             /**************************
3151              * CALCULATE INTERACTIONS *
3152              **************************/
3153
3154             if (gmx_mm_any_lt(rsq31,rcutoff2))
3155             {
3156
3157             r31              = _mm_mul_ps(rsq31,rinv31);
3158             r31              = _mm_andnot_ps(dummy_mask,r31);
3159
3160             /* EWALD ELECTROSTATICS */
3161
3162             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3163             ewrt             = _mm_mul_ps(r31,ewtabscale);
3164             ewitab           = _mm_cvttps_epi32(ewrt);
3165             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3166             ewitab           = _mm_slli_epi32(ewitab,2);
3167             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3168             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3169             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3170             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3171             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3172             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3173             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3174             velec            = _mm_mul_ps(qq31,_mm_sub_ps(rinv31,velec));
3175             felec            = _mm_mul_ps(_mm_mul_ps(qq31,rinv31),_mm_sub_ps(rinvsq31,felec));
3176
3177             d                = _mm_sub_ps(r31,rswitch);
3178             d                = _mm_max_ps(d,_mm_setzero_ps());
3179             d2               = _mm_mul_ps(d,d);
3180             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3181
3182             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3183
3184             /* Evaluate switch function */
3185             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3186             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv31,_mm_mul_ps(velec,dsw)) );
3187             cutoff_mask      = _mm_cmplt_ps(rsq31,rcutoff2);
3188
3189             fscal            = felec;
3190
3191             fscal            = _mm_and_ps(fscal,cutoff_mask);
3192
3193             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3194
3195             /* Calculate temporary vectorial force */
3196             tx               = _mm_mul_ps(fscal,dx31);
3197             ty               = _mm_mul_ps(fscal,dy31);
3198             tz               = _mm_mul_ps(fscal,dz31);
3199
3200             /* Update vectorial force */
3201             fix3             = _mm_add_ps(fix3,tx);
3202             fiy3             = _mm_add_ps(fiy3,ty);
3203             fiz3             = _mm_add_ps(fiz3,tz);
3204
3205             fjx1             = _mm_add_ps(fjx1,tx);
3206             fjy1             = _mm_add_ps(fjy1,ty);
3207             fjz1             = _mm_add_ps(fjz1,tz);
3208             
3209             }
3210
3211             /**************************
3212              * CALCULATE INTERACTIONS *
3213              **************************/
3214
3215             if (gmx_mm_any_lt(rsq32,rcutoff2))
3216             {
3217
3218             r32              = _mm_mul_ps(rsq32,rinv32);
3219             r32              = _mm_andnot_ps(dummy_mask,r32);
3220
3221             /* EWALD ELECTROSTATICS */
3222
3223             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3224             ewrt             = _mm_mul_ps(r32,ewtabscale);
3225             ewitab           = _mm_cvttps_epi32(ewrt);
3226             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3227             ewitab           = _mm_slli_epi32(ewitab,2);
3228             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3229             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3230             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3231             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3232             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3233             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3234             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3235             velec            = _mm_mul_ps(qq32,_mm_sub_ps(rinv32,velec));
3236             felec            = _mm_mul_ps(_mm_mul_ps(qq32,rinv32),_mm_sub_ps(rinvsq32,felec));
3237
3238             d                = _mm_sub_ps(r32,rswitch);
3239             d                = _mm_max_ps(d,_mm_setzero_ps());
3240             d2               = _mm_mul_ps(d,d);
3241             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3242
3243             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3244
3245             /* Evaluate switch function */
3246             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3247             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv32,_mm_mul_ps(velec,dsw)) );
3248             cutoff_mask      = _mm_cmplt_ps(rsq32,rcutoff2);
3249
3250             fscal            = felec;
3251
3252             fscal            = _mm_and_ps(fscal,cutoff_mask);
3253
3254             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3255
3256             /* Calculate temporary vectorial force */
3257             tx               = _mm_mul_ps(fscal,dx32);
3258             ty               = _mm_mul_ps(fscal,dy32);
3259             tz               = _mm_mul_ps(fscal,dz32);
3260
3261             /* Update vectorial force */
3262             fix3             = _mm_add_ps(fix3,tx);
3263             fiy3             = _mm_add_ps(fiy3,ty);
3264             fiz3             = _mm_add_ps(fiz3,tz);
3265
3266             fjx2             = _mm_add_ps(fjx2,tx);
3267             fjy2             = _mm_add_ps(fjy2,ty);
3268             fjz2             = _mm_add_ps(fjz2,tz);
3269             
3270             }
3271
3272             /**************************
3273              * CALCULATE INTERACTIONS *
3274              **************************/
3275
3276             if (gmx_mm_any_lt(rsq33,rcutoff2))
3277             {
3278
3279             r33              = _mm_mul_ps(rsq33,rinv33);
3280             r33              = _mm_andnot_ps(dummy_mask,r33);
3281
3282             /* EWALD ELECTROSTATICS */
3283
3284             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3285             ewrt             = _mm_mul_ps(r33,ewtabscale);
3286             ewitab           = _mm_cvttps_epi32(ewrt);
3287             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3288             ewitab           = _mm_slli_epi32(ewitab,2);
3289             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3290             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3291             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3292             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3293             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3294             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3295             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3296             velec            = _mm_mul_ps(qq33,_mm_sub_ps(rinv33,velec));
3297             felec            = _mm_mul_ps(_mm_mul_ps(qq33,rinv33),_mm_sub_ps(rinvsq33,felec));
3298
3299             d                = _mm_sub_ps(r33,rswitch);
3300             d                = _mm_max_ps(d,_mm_setzero_ps());
3301             d2               = _mm_mul_ps(d,d);
3302             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3303
3304             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3305
3306             /* Evaluate switch function */
3307             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3308             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv33,_mm_mul_ps(velec,dsw)) );
3309             cutoff_mask      = _mm_cmplt_ps(rsq33,rcutoff2);
3310
3311             fscal            = felec;
3312
3313             fscal            = _mm_and_ps(fscal,cutoff_mask);
3314
3315             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3316
3317             /* Calculate temporary vectorial force */
3318             tx               = _mm_mul_ps(fscal,dx33);
3319             ty               = _mm_mul_ps(fscal,dy33);
3320             tz               = _mm_mul_ps(fscal,dz33);
3321
3322             /* Update vectorial force */
3323             fix3             = _mm_add_ps(fix3,tx);
3324             fiy3             = _mm_add_ps(fiy3,ty);
3325             fiz3             = _mm_add_ps(fiz3,tz);
3326
3327             fjx3             = _mm_add_ps(fjx3,tx);
3328             fjy3             = _mm_add_ps(fjy3,ty);
3329             fjz3             = _mm_add_ps(fjz3,tz);
3330             
3331             }
3332
3333             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3334             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3335             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3336             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3337
3338             gmx_mm_decrement_4rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3339                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3340                                                    fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3341
3342             /* Inner loop uses 627 flops */
3343         }
3344
3345         /* End of innermost loop */
3346
3347         gmx_mm_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3348                                               f+i_coord_offset,fshift+i_shift_offset);
3349
3350         /* Increment number of inner iterations */
3351         inneriter                  += j_index_end - j_index_start;
3352
3353         /* Outer loop uses 24 flops */
3354     }
3355
3356     /* Increment number of outer iterations */
3357     outeriter        += nri;
3358
3359     /* Update outer/inner flops */
3360
3361     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*627);
3362 }