Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     int              nvdwtype;
93     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
94     int              *vdwtype;
95     real             *vdwparam;
96     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
97     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
98     __m128i          ewitab;
99     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100     real             *ewtab;
101     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
102     real             rswitch_scalar,d_scalar;
103     __m128           dummy_mask,cutoff_mask;
104     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
105     __m128           one     = _mm_set1_ps(1.0);
106     __m128           two     = _mm_set1_ps(2.0);
107     x                = xx[0];
108     f                = ff[0];
109
110     nri              = nlist->nri;
111     iinr             = nlist->iinr;
112     jindex           = nlist->jindex;
113     jjnr             = nlist->jjnr;
114     shiftidx         = nlist->shift;
115     gid              = nlist->gid;
116     shiftvec         = fr->shift_vec[0];
117     fshift           = fr->fshift[0];
118     facel            = _mm_set1_ps(fr->epsfac);
119     charge           = mdatoms->chargeA;
120     nvdwtype         = fr->ntype;
121     vdwparam         = fr->nbfp;
122     vdwtype          = mdatoms->typeA;
123
124     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
125     ewtab            = fr->ic->tabq_coul_FDV0;
126     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
127     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
128
129     /* Setup water-specific parameters */
130     inr              = nlist->iinr[0];
131     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
132     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
133     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
134     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
135
136     jq0              = _mm_set1_ps(charge[inr+0]);
137     jq1              = _mm_set1_ps(charge[inr+1]);
138     jq2              = _mm_set1_ps(charge[inr+2]);
139     vdwjidx0A        = 2*vdwtype[inr+0];
140     qq00             = _mm_mul_ps(iq0,jq0);
141     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
142     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
143     qq01             = _mm_mul_ps(iq0,jq1);
144     qq02             = _mm_mul_ps(iq0,jq2);
145     qq10             = _mm_mul_ps(iq1,jq0);
146     qq11             = _mm_mul_ps(iq1,jq1);
147     qq12             = _mm_mul_ps(iq1,jq2);
148     qq20             = _mm_mul_ps(iq2,jq0);
149     qq21             = _mm_mul_ps(iq2,jq1);
150     qq22             = _mm_mul_ps(iq2,jq2);
151
152     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
153     rcutoff_scalar   = fr->rcoulomb;
154     rcutoff          = _mm_set1_ps(rcutoff_scalar);
155     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
156
157     rswitch_scalar   = fr->rcoulomb_switch;
158     rswitch          = _mm_set1_ps(rswitch_scalar);
159     /* Setup switch parameters */
160     d_scalar         = rcutoff_scalar-rswitch_scalar;
161     d                = _mm_set1_ps(d_scalar);
162     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
163     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
164     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
165     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
166     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
167     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
168
169     /* Avoid stupid compiler warnings */
170     jnrA = jnrB = jnrC = jnrD = 0;
171     j_coord_offsetA = 0;
172     j_coord_offsetB = 0;
173     j_coord_offsetC = 0;
174     j_coord_offsetD = 0;
175
176     outeriter        = 0;
177     inneriter        = 0;
178
179     for(iidx=0;iidx<4*DIM;iidx++)
180     {
181         scratch[iidx] = 0.0;
182     }  
183
184     /* Start outer loop over neighborlists */
185     for(iidx=0; iidx<nri; iidx++)
186     {
187         /* Load shift vector for this list */
188         i_shift_offset   = DIM*shiftidx[iidx];
189
190         /* Load limits for loop over neighbors */
191         j_index_start    = jindex[iidx];
192         j_index_end      = jindex[iidx+1];
193
194         /* Get outer coordinate index */
195         inr              = iinr[iidx];
196         i_coord_offset   = DIM*inr;
197
198         /* Load i particle coords and add shift vector */
199         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
200                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
201         
202         fix0             = _mm_setzero_ps();
203         fiy0             = _mm_setzero_ps();
204         fiz0             = _mm_setzero_ps();
205         fix1             = _mm_setzero_ps();
206         fiy1             = _mm_setzero_ps();
207         fiz1             = _mm_setzero_ps();
208         fix2             = _mm_setzero_ps();
209         fiy2             = _mm_setzero_ps();
210         fiz2             = _mm_setzero_ps();
211
212         /* Reset potential sums */
213         velecsum         = _mm_setzero_ps();
214         vvdwsum          = _mm_setzero_ps();
215
216         /* Start inner kernel loop */
217         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
218         {
219
220             /* Get j neighbor index, and coordinate index */
221             jnrA             = jjnr[jidx];
222             jnrB             = jjnr[jidx+1];
223             jnrC             = jjnr[jidx+2];
224             jnrD             = jjnr[jidx+3];
225             j_coord_offsetA  = DIM*jnrA;
226             j_coord_offsetB  = DIM*jnrB;
227             j_coord_offsetC  = DIM*jnrC;
228             j_coord_offsetD  = DIM*jnrD;
229
230             /* load j atom coordinates */
231             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
232                                               x+j_coord_offsetC,x+j_coord_offsetD,
233                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
234
235             /* Calculate displacement vector */
236             dx00             = _mm_sub_ps(ix0,jx0);
237             dy00             = _mm_sub_ps(iy0,jy0);
238             dz00             = _mm_sub_ps(iz0,jz0);
239             dx01             = _mm_sub_ps(ix0,jx1);
240             dy01             = _mm_sub_ps(iy0,jy1);
241             dz01             = _mm_sub_ps(iz0,jz1);
242             dx02             = _mm_sub_ps(ix0,jx2);
243             dy02             = _mm_sub_ps(iy0,jy2);
244             dz02             = _mm_sub_ps(iz0,jz2);
245             dx10             = _mm_sub_ps(ix1,jx0);
246             dy10             = _mm_sub_ps(iy1,jy0);
247             dz10             = _mm_sub_ps(iz1,jz0);
248             dx11             = _mm_sub_ps(ix1,jx1);
249             dy11             = _mm_sub_ps(iy1,jy1);
250             dz11             = _mm_sub_ps(iz1,jz1);
251             dx12             = _mm_sub_ps(ix1,jx2);
252             dy12             = _mm_sub_ps(iy1,jy2);
253             dz12             = _mm_sub_ps(iz1,jz2);
254             dx20             = _mm_sub_ps(ix2,jx0);
255             dy20             = _mm_sub_ps(iy2,jy0);
256             dz20             = _mm_sub_ps(iz2,jz0);
257             dx21             = _mm_sub_ps(ix2,jx1);
258             dy21             = _mm_sub_ps(iy2,jy1);
259             dz21             = _mm_sub_ps(iz2,jz1);
260             dx22             = _mm_sub_ps(ix2,jx2);
261             dy22             = _mm_sub_ps(iy2,jy2);
262             dz22             = _mm_sub_ps(iz2,jz2);
263
264             /* Calculate squared distance and things based on it */
265             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
266             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
267             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
268             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
269             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
270             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
271             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
272             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
273             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
274
275             rinv00           = gmx_mm_invsqrt_ps(rsq00);
276             rinv01           = gmx_mm_invsqrt_ps(rsq01);
277             rinv02           = gmx_mm_invsqrt_ps(rsq02);
278             rinv10           = gmx_mm_invsqrt_ps(rsq10);
279             rinv11           = gmx_mm_invsqrt_ps(rsq11);
280             rinv12           = gmx_mm_invsqrt_ps(rsq12);
281             rinv20           = gmx_mm_invsqrt_ps(rsq20);
282             rinv21           = gmx_mm_invsqrt_ps(rsq21);
283             rinv22           = gmx_mm_invsqrt_ps(rsq22);
284
285             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
286             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
287             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
288             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
289             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
290             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
291             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
292             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
293             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
294
295             fjx0             = _mm_setzero_ps();
296             fjy0             = _mm_setzero_ps();
297             fjz0             = _mm_setzero_ps();
298             fjx1             = _mm_setzero_ps();
299             fjy1             = _mm_setzero_ps();
300             fjz1             = _mm_setzero_ps();
301             fjx2             = _mm_setzero_ps();
302             fjy2             = _mm_setzero_ps();
303             fjz2             = _mm_setzero_ps();
304
305             /**************************
306              * CALCULATE INTERACTIONS *
307              **************************/
308
309             if (gmx_mm_any_lt(rsq00,rcutoff2))
310             {
311
312             r00              = _mm_mul_ps(rsq00,rinv00);
313
314             /* EWALD ELECTROSTATICS */
315
316             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
317             ewrt             = _mm_mul_ps(r00,ewtabscale);
318             ewitab           = _mm_cvttps_epi32(ewrt);
319             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
320             ewitab           = _mm_slli_epi32(ewitab,2);
321             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
322             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
323             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
324             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
325             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
326             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
327             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
328             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
329             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
330
331             /* LENNARD-JONES DISPERSION/REPULSION */
332
333             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
334             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
335             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
336             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
337             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
338
339             d                = _mm_sub_ps(r00,rswitch);
340             d                = _mm_max_ps(d,_mm_setzero_ps());
341             d2               = _mm_mul_ps(d,d);
342             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
343
344             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
345
346             /* Evaluate switch function */
347             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
348             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
349             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
350             velec            = _mm_mul_ps(velec,sw);
351             vvdw             = _mm_mul_ps(vvdw,sw);
352             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
353
354             /* Update potential sum for this i atom from the interaction with this j atom. */
355             velec            = _mm_and_ps(velec,cutoff_mask);
356             velecsum         = _mm_add_ps(velecsum,velec);
357             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
358             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
359
360             fscal            = _mm_add_ps(felec,fvdw);
361
362             fscal            = _mm_and_ps(fscal,cutoff_mask);
363
364             /* Calculate temporary vectorial force */
365             tx               = _mm_mul_ps(fscal,dx00);
366             ty               = _mm_mul_ps(fscal,dy00);
367             tz               = _mm_mul_ps(fscal,dz00);
368
369             /* Update vectorial force */
370             fix0             = _mm_add_ps(fix0,tx);
371             fiy0             = _mm_add_ps(fiy0,ty);
372             fiz0             = _mm_add_ps(fiz0,tz);
373
374             fjx0             = _mm_add_ps(fjx0,tx);
375             fjy0             = _mm_add_ps(fjy0,ty);
376             fjz0             = _mm_add_ps(fjz0,tz);
377             
378             }
379
380             /**************************
381              * CALCULATE INTERACTIONS *
382              **************************/
383
384             if (gmx_mm_any_lt(rsq01,rcutoff2))
385             {
386
387             r01              = _mm_mul_ps(rsq01,rinv01);
388
389             /* EWALD ELECTROSTATICS */
390
391             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
392             ewrt             = _mm_mul_ps(r01,ewtabscale);
393             ewitab           = _mm_cvttps_epi32(ewrt);
394             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
395             ewitab           = _mm_slli_epi32(ewitab,2);
396             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
397             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
398             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
399             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
400             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
401             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
402             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
403             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
404             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
405
406             d                = _mm_sub_ps(r01,rswitch);
407             d                = _mm_max_ps(d,_mm_setzero_ps());
408             d2               = _mm_mul_ps(d,d);
409             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
410
411             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
412
413             /* Evaluate switch function */
414             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
415             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
416             velec            = _mm_mul_ps(velec,sw);
417             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
418
419             /* Update potential sum for this i atom from the interaction with this j atom. */
420             velec            = _mm_and_ps(velec,cutoff_mask);
421             velecsum         = _mm_add_ps(velecsum,velec);
422
423             fscal            = felec;
424
425             fscal            = _mm_and_ps(fscal,cutoff_mask);
426
427             /* Calculate temporary vectorial force */
428             tx               = _mm_mul_ps(fscal,dx01);
429             ty               = _mm_mul_ps(fscal,dy01);
430             tz               = _mm_mul_ps(fscal,dz01);
431
432             /* Update vectorial force */
433             fix0             = _mm_add_ps(fix0,tx);
434             fiy0             = _mm_add_ps(fiy0,ty);
435             fiz0             = _mm_add_ps(fiz0,tz);
436
437             fjx1             = _mm_add_ps(fjx1,tx);
438             fjy1             = _mm_add_ps(fjy1,ty);
439             fjz1             = _mm_add_ps(fjz1,tz);
440             
441             }
442
443             /**************************
444              * CALCULATE INTERACTIONS *
445              **************************/
446
447             if (gmx_mm_any_lt(rsq02,rcutoff2))
448             {
449
450             r02              = _mm_mul_ps(rsq02,rinv02);
451
452             /* EWALD ELECTROSTATICS */
453
454             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
455             ewrt             = _mm_mul_ps(r02,ewtabscale);
456             ewitab           = _mm_cvttps_epi32(ewrt);
457             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
458             ewitab           = _mm_slli_epi32(ewitab,2);
459             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
460             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
461             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
462             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
463             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
464             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
465             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
466             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
467             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
468
469             d                = _mm_sub_ps(r02,rswitch);
470             d                = _mm_max_ps(d,_mm_setzero_ps());
471             d2               = _mm_mul_ps(d,d);
472             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
473
474             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
475
476             /* Evaluate switch function */
477             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
478             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
479             velec            = _mm_mul_ps(velec,sw);
480             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
481
482             /* Update potential sum for this i atom from the interaction with this j atom. */
483             velec            = _mm_and_ps(velec,cutoff_mask);
484             velecsum         = _mm_add_ps(velecsum,velec);
485
486             fscal            = felec;
487
488             fscal            = _mm_and_ps(fscal,cutoff_mask);
489
490             /* Calculate temporary vectorial force */
491             tx               = _mm_mul_ps(fscal,dx02);
492             ty               = _mm_mul_ps(fscal,dy02);
493             tz               = _mm_mul_ps(fscal,dz02);
494
495             /* Update vectorial force */
496             fix0             = _mm_add_ps(fix0,tx);
497             fiy0             = _mm_add_ps(fiy0,ty);
498             fiz0             = _mm_add_ps(fiz0,tz);
499
500             fjx2             = _mm_add_ps(fjx2,tx);
501             fjy2             = _mm_add_ps(fjy2,ty);
502             fjz2             = _mm_add_ps(fjz2,tz);
503             
504             }
505
506             /**************************
507              * CALCULATE INTERACTIONS *
508              **************************/
509
510             if (gmx_mm_any_lt(rsq10,rcutoff2))
511             {
512
513             r10              = _mm_mul_ps(rsq10,rinv10);
514
515             /* EWALD ELECTROSTATICS */
516
517             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
518             ewrt             = _mm_mul_ps(r10,ewtabscale);
519             ewitab           = _mm_cvttps_epi32(ewrt);
520             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
521             ewitab           = _mm_slli_epi32(ewitab,2);
522             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
523             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
524             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
525             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
526             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
527             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
528             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
529             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
530             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
531
532             d                = _mm_sub_ps(r10,rswitch);
533             d                = _mm_max_ps(d,_mm_setzero_ps());
534             d2               = _mm_mul_ps(d,d);
535             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
536
537             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
538
539             /* Evaluate switch function */
540             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
541             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
542             velec            = _mm_mul_ps(velec,sw);
543             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
544
545             /* Update potential sum for this i atom from the interaction with this j atom. */
546             velec            = _mm_and_ps(velec,cutoff_mask);
547             velecsum         = _mm_add_ps(velecsum,velec);
548
549             fscal            = felec;
550
551             fscal            = _mm_and_ps(fscal,cutoff_mask);
552
553             /* Calculate temporary vectorial force */
554             tx               = _mm_mul_ps(fscal,dx10);
555             ty               = _mm_mul_ps(fscal,dy10);
556             tz               = _mm_mul_ps(fscal,dz10);
557
558             /* Update vectorial force */
559             fix1             = _mm_add_ps(fix1,tx);
560             fiy1             = _mm_add_ps(fiy1,ty);
561             fiz1             = _mm_add_ps(fiz1,tz);
562
563             fjx0             = _mm_add_ps(fjx0,tx);
564             fjy0             = _mm_add_ps(fjy0,ty);
565             fjz0             = _mm_add_ps(fjz0,tz);
566             
567             }
568
569             /**************************
570              * CALCULATE INTERACTIONS *
571              **************************/
572
573             if (gmx_mm_any_lt(rsq11,rcutoff2))
574             {
575
576             r11              = _mm_mul_ps(rsq11,rinv11);
577
578             /* EWALD ELECTROSTATICS */
579
580             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
581             ewrt             = _mm_mul_ps(r11,ewtabscale);
582             ewitab           = _mm_cvttps_epi32(ewrt);
583             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
584             ewitab           = _mm_slli_epi32(ewitab,2);
585             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
586             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
587             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
588             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
589             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
590             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
591             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
592             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
593             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
594
595             d                = _mm_sub_ps(r11,rswitch);
596             d                = _mm_max_ps(d,_mm_setzero_ps());
597             d2               = _mm_mul_ps(d,d);
598             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
599
600             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
601
602             /* Evaluate switch function */
603             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
604             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
605             velec            = _mm_mul_ps(velec,sw);
606             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
607
608             /* Update potential sum for this i atom from the interaction with this j atom. */
609             velec            = _mm_and_ps(velec,cutoff_mask);
610             velecsum         = _mm_add_ps(velecsum,velec);
611
612             fscal            = felec;
613
614             fscal            = _mm_and_ps(fscal,cutoff_mask);
615
616             /* Calculate temporary vectorial force */
617             tx               = _mm_mul_ps(fscal,dx11);
618             ty               = _mm_mul_ps(fscal,dy11);
619             tz               = _mm_mul_ps(fscal,dz11);
620
621             /* Update vectorial force */
622             fix1             = _mm_add_ps(fix1,tx);
623             fiy1             = _mm_add_ps(fiy1,ty);
624             fiz1             = _mm_add_ps(fiz1,tz);
625
626             fjx1             = _mm_add_ps(fjx1,tx);
627             fjy1             = _mm_add_ps(fjy1,ty);
628             fjz1             = _mm_add_ps(fjz1,tz);
629             
630             }
631
632             /**************************
633              * CALCULATE INTERACTIONS *
634              **************************/
635
636             if (gmx_mm_any_lt(rsq12,rcutoff2))
637             {
638
639             r12              = _mm_mul_ps(rsq12,rinv12);
640
641             /* EWALD ELECTROSTATICS */
642
643             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
644             ewrt             = _mm_mul_ps(r12,ewtabscale);
645             ewitab           = _mm_cvttps_epi32(ewrt);
646             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
647             ewitab           = _mm_slli_epi32(ewitab,2);
648             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
649             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
650             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
651             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
652             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
653             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
654             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
655             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
656             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
657
658             d                = _mm_sub_ps(r12,rswitch);
659             d                = _mm_max_ps(d,_mm_setzero_ps());
660             d2               = _mm_mul_ps(d,d);
661             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
662
663             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
664
665             /* Evaluate switch function */
666             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
667             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
668             velec            = _mm_mul_ps(velec,sw);
669             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
670
671             /* Update potential sum for this i atom from the interaction with this j atom. */
672             velec            = _mm_and_ps(velec,cutoff_mask);
673             velecsum         = _mm_add_ps(velecsum,velec);
674
675             fscal            = felec;
676
677             fscal            = _mm_and_ps(fscal,cutoff_mask);
678
679             /* Calculate temporary vectorial force */
680             tx               = _mm_mul_ps(fscal,dx12);
681             ty               = _mm_mul_ps(fscal,dy12);
682             tz               = _mm_mul_ps(fscal,dz12);
683
684             /* Update vectorial force */
685             fix1             = _mm_add_ps(fix1,tx);
686             fiy1             = _mm_add_ps(fiy1,ty);
687             fiz1             = _mm_add_ps(fiz1,tz);
688
689             fjx2             = _mm_add_ps(fjx2,tx);
690             fjy2             = _mm_add_ps(fjy2,ty);
691             fjz2             = _mm_add_ps(fjz2,tz);
692             
693             }
694
695             /**************************
696              * CALCULATE INTERACTIONS *
697              **************************/
698
699             if (gmx_mm_any_lt(rsq20,rcutoff2))
700             {
701
702             r20              = _mm_mul_ps(rsq20,rinv20);
703
704             /* EWALD ELECTROSTATICS */
705
706             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
707             ewrt             = _mm_mul_ps(r20,ewtabscale);
708             ewitab           = _mm_cvttps_epi32(ewrt);
709             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
710             ewitab           = _mm_slli_epi32(ewitab,2);
711             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
712             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
713             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
714             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
715             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
716             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
717             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
718             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
719             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
720
721             d                = _mm_sub_ps(r20,rswitch);
722             d                = _mm_max_ps(d,_mm_setzero_ps());
723             d2               = _mm_mul_ps(d,d);
724             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
725
726             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
727
728             /* Evaluate switch function */
729             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
730             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
731             velec            = _mm_mul_ps(velec,sw);
732             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
733
734             /* Update potential sum for this i atom from the interaction with this j atom. */
735             velec            = _mm_and_ps(velec,cutoff_mask);
736             velecsum         = _mm_add_ps(velecsum,velec);
737
738             fscal            = felec;
739
740             fscal            = _mm_and_ps(fscal,cutoff_mask);
741
742             /* Calculate temporary vectorial force */
743             tx               = _mm_mul_ps(fscal,dx20);
744             ty               = _mm_mul_ps(fscal,dy20);
745             tz               = _mm_mul_ps(fscal,dz20);
746
747             /* Update vectorial force */
748             fix2             = _mm_add_ps(fix2,tx);
749             fiy2             = _mm_add_ps(fiy2,ty);
750             fiz2             = _mm_add_ps(fiz2,tz);
751
752             fjx0             = _mm_add_ps(fjx0,tx);
753             fjy0             = _mm_add_ps(fjy0,ty);
754             fjz0             = _mm_add_ps(fjz0,tz);
755             
756             }
757
758             /**************************
759              * CALCULATE INTERACTIONS *
760              **************************/
761
762             if (gmx_mm_any_lt(rsq21,rcutoff2))
763             {
764
765             r21              = _mm_mul_ps(rsq21,rinv21);
766
767             /* EWALD ELECTROSTATICS */
768
769             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
770             ewrt             = _mm_mul_ps(r21,ewtabscale);
771             ewitab           = _mm_cvttps_epi32(ewrt);
772             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
773             ewitab           = _mm_slli_epi32(ewitab,2);
774             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
775             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
776             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
777             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
778             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
779             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
780             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
781             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
782             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
783
784             d                = _mm_sub_ps(r21,rswitch);
785             d                = _mm_max_ps(d,_mm_setzero_ps());
786             d2               = _mm_mul_ps(d,d);
787             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
788
789             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
790
791             /* Evaluate switch function */
792             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
793             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
794             velec            = _mm_mul_ps(velec,sw);
795             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
796
797             /* Update potential sum for this i atom from the interaction with this j atom. */
798             velec            = _mm_and_ps(velec,cutoff_mask);
799             velecsum         = _mm_add_ps(velecsum,velec);
800
801             fscal            = felec;
802
803             fscal            = _mm_and_ps(fscal,cutoff_mask);
804
805             /* Calculate temporary vectorial force */
806             tx               = _mm_mul_ps(fscal,dx21);
807             ty               = _mm_mul_ps(fscal,dy21);
808             tz               = _mm_mul_ps(fscal,dz21);
809
810             /* Update vectorial force */
811             fix2             = _mm_add_ps(fix2,tx);
812             fiy2             = _mm_add_ps(fiy2,ty);
813             fiz2             = _mm_add_ps(fiz2,tz);
814
815             fjx1             = _mm_add_ps(fjx1,tx);
816             fjy1             = _mm_add_ps(fjy1,ty);
817             fjz1             = _mm_add_ps(fjz1,tz);
818             
819             }
820
821             /**************************
822              * CALCULATE INTERACTIONS *
823              **************************/
824
825             if (gmx_mm_any_lt(rsq22,rcutoff2))
826             {
827
828             r22              = _mm_mul_ps(rsq22,rinv22);
829
830             /* EWALD ELECTROSTATICS */
831
832             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
833             ewrt             = _mm_mul_ps(r22,ewtabscale);
834             ewitab           = _mm_cvttps_epi32(ewrt);
835             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
836             ewitab           = _mm_slli_epi32(ewitab,2);
837             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
838             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
839             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
840             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
841             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
842             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
843             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
844             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
845             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
846
847             d                = _mm_sub_ps(r22,rswitch);
848             d                = _mm_max_ps(d,_mm_setzero_ps());
849             d2               = _mm_mul_ps(d,d);
850             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
851
852             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
853
854             /* Evaluate switch function */
855             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
856             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
857             velec            = _mm_mul_ps(velec,sw);
858             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
859
860             /* Update potential sum for this i atom from the interaction with this j atom. */
861             velec            = _mm_and_ps(velec,cutoff_mask);
862             velecsum         = _mm_add_ps(velecsum,velec);
863
864             fscal            = felec;
865
866             fscal            = _mm_and_ps(fscal,cutoff_mask);
867
868             /* Calculate temporary vectorial force */
869             tx               = _mm_mul_ps(fscal,dx22);
870             ty               = _mm_mul_ps(fscal,dy22);
871             tz               = _mm_mul_ps(fscal,dz22);
872
873             /* Update vectorial force */
874             fix2             = _mm_add_ps(fix2,tx);
875             fiy2             = _mm_add_ps(fiy2,ty);
876             fiz2             = _mm_add_ps(fiz2,tz);
877
878             fjx2             = _mm_add_ps(fjx2,tx);
879             fjy2             = _mm_add_ps(fjy2,ty);
880             fjz2             = _mm_add_ps(fjz2,tz);
881             
882             }
883
884             fjptrA             = f+j_coord_offsetA;
885             fjptrB             = f+j_coord_offsetB;
886             fjptrC             = f+j_coord_offsetC;
887             fjptrD             = f+j_coord_offsetD;
888
889             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
890                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
891
892             /* Inner loop uses 603 flops */
893         }
894
895         if(jidx<j_index_end)
896         {
897
898             /* Get j neighbor index, and coordinate index */
899             jnrlistA         = jjnr[jidx];
900             jnrlistB         = jjnr[jidx+1];
901             jnrlistC         = jjnr[jidx+2];
902             jnrlistD         = jjnr[jidx+3];
903             /* Sign of each element will be negative for non-real atoms.
904              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
905              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
906              */
907             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
908             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
909             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
910             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
911             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
912             j_coord_offsetA  = DIM*jnrA;
913             j_coord_offsetB  = DIM*jnrB;
914             j_coord_offsetC  = DIM*jnrC;
915             j_coord_offsetD  = DIM*jnrD;
916
917             /* load j atom coordinates */
918             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
919                                               x+j_coord_offsetC,x+j_coord_offsetD,
920                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
921
922             /* Calculate displacement vector */
923             dx00             = _mm_sub_ps(ix0,jx0);
924             dy00             = _mm_sub_ps(iy0,jy0);
925             dz00             = _mm_sub_ps(iz0,jz0);
926             dx01             = _mm_sub_ps(ix0,jx1);
927             dy01             = _mm_sub_ps(iy0,jy1);
928             dz01             = _mm_sub_ps(iz0,jz1);
929             dx02             = _mm_sub_ps(ix0,jx2);
930             dy02             = _mm_sub_ps(iy0,jy2);
931             dz02             = _mm_sub_ps(iz0,jz2);
932             dx10             = _mm_sub_ps(ix1,jx0);
933             dy10             = _mm_sub_ps(iy1,jy0);
934             dz10             = _mm_sub_ps(iz1,jz0);
935             dx11             = _mm_sub_ps(ix1,jx1);
936             dy11             = _mm_sub_ps(iy1,jy1);
937             dz11             = _mm_sub_ps(iz1,jz1);
938             dx12             = _mm_sub_ps(ix1,jx2);
939             dy12             = _mm_sub_ps(iy1,jy2);
940             dz12             = _mm_sub_ps(iz1,jz2);
941             dx20             = _mm_sub_ps(ix2,jx0);
942             dy20             = _mm_sub_ps(iy2,jy0);
943             dz20             = _mm_sub_ps(iz2,jz0);
944             dx21             = _mm_sub_ps(ix2,jx1);
945             dy21             = _mm_sub_ps(iy2,jy1);
946             dz21             = _mm_sub_ps(iz2,jz1);
947             dx22             = _mm_sub_ps(ix2,jx2);
948             dy22             = _mm_sub_ps(iy2,jy2);
949             dz22             = _mm_sub_ps(iz2,jz2);
950
951             /* Calculate squared distance and things based on it */
952             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
953             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
954             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
955             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
956             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
957             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
958             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
959             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
960             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
961
962             rinv00           = gmx_mm_invsqrt_ps(rsq00);
963             rinv01           = gmx_mm_invsqrt_ps(rsq01);
964             rinv02           = gmx_mm_invsqrt_ps(rsq02);
965             rinv10           = gmx_mm_invsqrt_ps(rsq10);
966             rinv11           = gmx_mm_invsqrt_ps(rsq11);
967             rinv12           = gmx_mm_invsqrt_ps(rsq12);
968             rinv20           = gmx_mm_invsqrt_ps(rsq20);
969             rinv21           = gmx_mm_invsqrt_ps(rsq21);
970             rinv22           = gmx_mm_invsqrt_ps(rsq22);
971
972             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
973             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
974             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
975             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
976             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
977             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
978             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
979             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
980             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
981
982             fjx0             = _mm_setzero_ps();
983             fjy0             = _mm_setzero_ps();
984             fjz0             = _mm_setzero_ps();
985             fjx1             = _mm_setzero_ps();
986             fjy1             = _mm_setzero_ps();
987             fjz1             = _mm_setzero_ps();
988             fjx2             = _mm_setzero_ps();
989             fjy2             = _mm_setzero_ps();
990             fjz2             = _mm_setzero_ps();
991
992             /**************************
993              * CALCULATE INTERACTIONS *
994              **************************/
995
996             if (gmx_mm_any_lt(rsq00,rcutoff2))
997             {
998
999             r00              = _mm_mul_ps(rsq00,rinv00);
1000             r00              = _mm_andnot_ps(dummy_mask,r00);
1001
1002             /* EWALD ELECTROSTATICS */
1003
1004             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1005             ewrt             = _mm_mul_ps(r00,ewtabscale);
1006             ewitab           = _mm_cvttps_epi32(ewrt);
1007             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1008             ewitab           = _mm_slli_epi32(ewitab,2);
1009             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1010             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1011             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1012             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1013             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1014             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1015             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1016             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1017             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1018
1019             /* LENNARD-JONES DISPERSION/REPULSION */
1020
1021             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1022             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1023             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1024             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1025             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1026
1027             d                = _mm_sub_ps(r00,rswitch);
1028             d                = _mm_max_ps(d,_mm_setzero_ps());
1029             d2               = _mm_mul_ps(d,d);
1030             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1031
1032             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1033
1034             /* Evaluate switch function */
1035             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1036             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1037             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1038             velec            = _mm_mul_ps(velec,sw);
1039             vvdw             = _mm_mul_ps(vvdw,sw);
1040             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1041
1042             /* Update potential sum for this i atom from the interaction with this j atom. */
1043             velec            = _mm_and_ps(velec,cutoff_mask);
1044             velec            = _mm_andnot_ps(dummy_mask,velec);
1045             velecsum         = _mm_add_ps(velecsum,velec);
1046             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
1047             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
1048             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
1049
1050             fscal            = _mm_add_ps(felec,fvdw);
1051
1052             fscal            = _mm_and_ps(fscal,cutoff_mask);
1053
1054             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1055
1056             /* Calculate temporary vectorial force */
1057             tx               = _mm_mul_ps(fscal,dx00);
1058             ty               = _mm_mul_ps(fscal,dy00);
1059             tz               = _mm_mul_ps(fscal,dz00);
1060
1061             /* Update vectorial force */
1062             fix0             = _mm_add_ps(fix0,tx);
1063             fiy0             = _mm_add_ps(fiy0,ty);
1064             fiz0             = _mm_add_ps(fiz0,tz);
1065
1066             fjx0             = _mm_add_ps(fjx0,tx);
1067             fjy0             = _mm_add_ps(fjy0,ty);
1068             fjz0             = _mm_add_ps(fjz0,tz);
1069             
1070             }
1071
1072             /**************************
1073              * CALCULATE INTERACTIONS *
1074              **************************/
1075
1076             if (gmx_mm_any_lt(rsq01,rcutoff2))
1077             {
1078
1079             r01              = _mm_mul_ps(rsq01,rinv01);
1080             r01              = _mm_andnot_ps(dummy_mask,r01);
1081
1082             /* EWALD ELECTROSTATICS */
1083
1084             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1085             ewrt             = _mm_mul_ps(r01,ewtabscale);
1086             ewitab           = _mm_cvttps_epi32(ewrt);
1087             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1088             ewitab           = _mm_slli_epi32(ewitab,2);
1089             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1090             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1091             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1092             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1093             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1094             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1095             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1096             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1097             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1098
1099             d                = _mm_sub_ps(r01,rswitch);
1100             d                = _mm_max_ps(d,_mm_setzero_ps());
1101             d2               = _mm_mul_ps(d,d);
1102             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1103
1104             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1105
1106             /* Evaluate switch function */
1107             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1108             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
1109             velec            = _mm_mul_ps(velec,sw);
1110             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1111
1112             /* Update potential sum for this i atom from the interaction with this j atom. */
1113             velec            = _mm_and_ps(velec,cutoff_mask);
1114             velec            = _mm_andnot_ps(dummy_mask,velec);
1115             velecsum         = _mm_add_ps(velecsum,velec);
1116
1117             fscal            = felec;
1118
1119             fscal            = _mm_and_ps(fscal,cutoff_mask);
1120
1121             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1122
1123             /* Calculate temporary vectorial force */
1124             tx               = _mm_mul_ps(fscal,dx01);
1125             ty               = _mm_mul_ps(fscal,dy01);
1126             tz               = _mm_mul_ps(fscal,dz01);
1127
1128             /* Update vectorial force */
1129             fix0             = _mm_add_ps(fix0,tx);
1130             fiy0             = _mm_add_ps(fiy0,ty);
1131             fiz0             = _mm_add_ps(fiz0,tz);
1132
1133             fjx1             = _mm_add_ps(fjx1,tx);
1134             fjy1             = _mm_add_ps(fjy1,ty);
1135             fjz1             = _mm_add_ps(fjz1,tz);
1136             
1137             }
1138
1139             /**************************
1140              * CALCULATE INTERACTIONS *
1141              **************************/
1142
1143             if (gmx_mm_any_lt(rsq02,rcutoff2))
1144             {
1145
1146             r02              = _mm_mul_ps(rsq02,rinv02);
1147             r02              = _mm_andnot_ps(dummy_mask,r02);
1148
1149             /* EWALD ELECTROSTATICS */
1150
1151             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1152             ewrt             = _mm_mul_ps(r02,ewtabscale);
1153             ewitab           = _mm_cvttps_epi32(ewrt);
1154             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1155             ewitab           = _mm_slli_epi32(ewitab,2);
1156             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1157             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1158             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1159             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1160             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1161             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1162             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1163             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
1164             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1165
1166             d                = _mm_sub_ps(r02,rswitch);
1167             d                = _mm_max_ps(d,_mm_setzero_ps());
1168             d2               = _mm_mul_ps(d,d);
1169             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1170
1171             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1172
1173             /* Evaluate switch function */
1174             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1175             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
1176             velec            = _mm_mul_ps(velec,sw);
1177             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1178
1179             /* Update potential sum for this i atom from the interaction with this j atom. */
1180             velec            = _mm_and_ps(velec,cutoff_mask);
1181             velec            = _mm_andnot_ps(dummy_mask,velec);
1182             velecsum         = _mm_add_ps(velecsum,velec);
1183
1184             fscal            = felec;
1185
1186             fscal            = _mm_and_ps(fscal,cutoff_mask);
1187
1188             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1189
1190             /* Calculate temporary vectorial force */
1191             tx               = _mm_mul_ps(fscal,dx02);
1192             ty               = _mm_mul_ps(fscal,dy02);
1193             tz               = _mm_mul_ps(fscal,dz02);
1194
1195             /* Update vectorial force */
1196             fix0             = _mm_add_ps(fix0,tx);
1197             fiy0             = _mm_add_ps(fiy0,ty);
1198             fiz0             = _mm_add_ps(fiz0,tz);
1199
1200             fjx2             = _mm_add_ps(fjx2,tx);
1201             fjy2             = _mm_add_ps(fjy2,ty);
1202             fjz2             = _mm_add_ps(fjz2,tz);
1203             
1204             }
1205
1206             /**************************
1207              * CALCULATE INTERACTIONS *
1208              **************************/
1209
1210             if (gmx_mm_any_lt(rsq10,rcutoff2))
1211             {
1212
1213             r10              = _mm_mul_ps(rsq10,rinv10);
1214             r10              = _mm_andnot_ps(dummy_mask,r10);
1215
1216             /* EWALD ELECTROSTATICS */
1217
1218             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1219             ewrt             = _mm_mul_ps(r10,ewtabscale);
1220             ewitab           = _mm_cvttps_epi32(ewrt);
1221             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1222             ewitab           = _mm_slli_epi32(ewitab,2);
1223             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1224             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1225             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1226             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1227             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1228             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1229             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1230             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1231             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1232
1233             d                = _mm_sub_ps(r10,rswitch);
1234             d                = _mm_max_ps(d,_mm_setzero_ps());
1235             d2               = _mm_mul_ps(d,d);
1236             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1237
1238             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1239
1240             /* Evaluate switch function */
1241             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1242             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
1243             velec            = _mm_mul_ps(velec,sw);
1244             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1245
1246             /* Update potential sum for this i atom from the interaction with this j atom. */
1247             velec            = _mm_and_ps(velec,cutoff_mask);
1248             velec            = _mm_andnot_ps(dummy_mask,velec);
1249             velecsum         = _mm_add_ps(velecsum,velec);
1250
1251             fscal            = felec;
1252
1253             fscal            = _mm_and_ps(fscal,cutoff_mask);
1254
1255             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1256
1257             /* Calculate temporary vectorial force */
1258             tx               = _mm_mul_ps(fscal,dx10);
1259             ty               = _mm_mul_ps(fscal,dy10);
1260             tz               = _mm_mul_ps(fscal,dz10);
1261
1262             /* Update vectorial force */
1263             fix1             = _mm_add_ps(fix1,tx);
1264             fiy1             = _mm_add_ps(fiy1,ty);
1265             fiz1             = _mm_add_ps(fiz1,tz);
1266
1267             fjx0             = _mm_add_ps(fjx0,tx);
1268             fjy0             = _mm_add_ps(fjy0,ty);
1269             fjz0             = _mm_add_ps(fjz0,tz);
1270             
1271             }
1272
1273             /**************************
1274              * CALCULATE INTERACTIONS *
1275              **************************/
1276
1277             if (gmx_mm_any_lt(rsq11,rcutoff2))
1278             {
1279
1280             r11              = _mm_mul_ps(rsq11,rinv11);
1281             r11              = _mm_andnot_ps(dummy_mask,r11);
1282
1283             /* EWALD ELECTROSTATICS */
1284
1285             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1286             ewrt             = _mm_mul_ps(r11,ewtabscale);
1287             ewitab           = _mm_cvttps_epi32(ewrt);
1288             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1289             ewitab           = _mm_slli_epi32(ewitab,2);
1290             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1291             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1292             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1293             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1294             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1295             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1296             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1297             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1298             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1299
1300             d                = _mm_sub_ps(r11,rswitch);
1301             d                = _mm_max_ps(d,_mm_setzero_ps());
1302             d2               = _mm_mul_ps(d,d);
1303             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1304
1305             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1306
1307             /* Evaluate switch function */
1308             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1309             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
1310             velec            = _mm_mul_ps(velec,sw);
1311             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1312
1313             /* Update potential sum for this i atom from the interaction with this j atom. */
1314             velec            = _mm_and_ps(velec,cutoff_mask);
1315             velec            = _mm_andnot_ps(dummy_mask,velec);
1316             velecsum         = _mm_add_ps(velecsum,velec);
1317
1318             fscal            = felec;
1319
1320             fscal            = _mm_and_ps(fscal,cutoff_mask);
1321
1322             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1323
1324             /* Calculate temporary vectorial force */
1325             tx               = _mm_mul_ps(fscal,dx11);
1326             ty               = _mm_mul_ps(fscal,dy11);
1327             tz               = _mm_mul_ps(fscal,dz11);
1328
1329             /* Update vectorial force */
1330             fix1             = _mm_add_ps(fix1,tx);
1331             fiy1             = _mm_add_ps(fiy1,ty);
1332             fiz1             = _mm_add_ps(fiz1,tz);
1333
1334             fjx1             = _mm_add_ps(fjx1,tx);
1335             fjy1             = _mm_add_ps(fjy1,ty);
1336             fjz1             = _mm_add_ps(fjz1,tz);
1337             
1338             }
1339
1340             /**************************
1341              * CALCULATE INTERACTIONS *
1342              **************************/
1343
1344             if (gmx_mm_any_lt(rsq12,rcutoff2))
1345             {
1346
1347             r12              = _mm_mul_ps(rsq12,rinv12);
1348             r12              = _mm_andnot_ps(dummy_mask,r12);
1349
1350             /* EWALD ELECTROSTATICS */
1351
1352             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1353             ewrt             = _mm_mul_ps(r12,ewtabscale);
1354             ewitab           = _mm_cvttps_epi32(ewrt);
1355             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1356             ewitab           = _mm_slli_epi32(ewitab,2);
1357             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1358             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1359             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1360             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1361             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1362             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1363             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1364             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1365             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1366
1367             d                = _mm_sub_ps(r12,rswitch);
1368             d                = _mm_max_ps(d,_mm_setzero_ps());
1369             d2               = _mm_mul_ps(d,d);
1370             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1371
1372             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1373
1374             /* Evaluate switch function */
1375             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1376             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
1377             velec            = _mm_mul_ps(velec,sw);
1378             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1379
1380             /* Update potential sum for this i atom from the interaction with this j atom. */
1381             velec            = _mm_and_ps(velec,cutoff_mask);
1382             velec            = _mm_andnot_ps(dummy_mask,velec);
1383             velecsum         = _mm_add_ps(velecsum,velec);
1384
1385             fscal            = felec;
1386
1387             fscal            = _mm_and_ps(fscal,cutoff_mask);
1388
1389             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1390
1391             /* Calculate temporary vectorial force */
1392             tx               = _mm_mul_ps(fscal,dx12);
1393             ty               = _mm_mul_ps(fscal,dy12);
1394             tz               = _mm_mul_ps(fscal,dz12);
1395
1396             /* Update vectorial force */
1397             fix1             = _mm_add_ps(fix1,tx);
1398             fiy1             = _mm_add_ps(fiy1,ty);
1399             fiz1             = _mm_add_ps(fiz1,tz);
1400
1401             fjx2             = _mm_add_ps(fjx2,tx);
1402             fjy2             = _mm_add_ps(fjy2,ty);
1403             fjz2             = _mm_add_ps(fjz2,tz);
1404             
1405             }
1406
1407             /**************************
1408              * CALCULATE INTERACTIONS *
1409              **************************/
1410
1411             if (gmx_mm_any_lt(rsq20,rcutoff2))
1412             {
1413
1414             r20              = _mm_mul_ps(rsq20,rinv20);
1415             r20              = _mm_andnot_ps(dummy_mask,r20);
1416
1417             /* EWALD ELECTROSTATICS */
1418
1419             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1420             ewrt             = _mm_mul_ps(r20,ewtabscale);
1421             ewitab           = _mm_cvttps_epi32(ewrt);
1422             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1423             ewitab           = _mm_slli_epi32(ewitab,2);
1424             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1425             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1426             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1427             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1428             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1429             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1430             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1431             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1432             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1433
1434             d                = _mm_sub_ps(r20,rswitch);
1435             d                = _mm_max_ps(d,_mm_setzero_ps());
1436             d2               = _mm_mul_ps(d,d);
1437             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1438
1439             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1440
1441             /* Evaluate switch function */
1442             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1443             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
1444             velec            = _mm_mul_ps(velec,sw);
1445             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1446
1447             /* Update potential sum for this i atom from the interaction with this j atom. */
1448             velec            = _mm_and_ps(velec,cutoff_mask);
1449             velec            = _mm_andnot_ps(dummy_mask,velec);
1450             velecsum         = _mm_add_ps(velecsum,velec);
1451
1452             fscal            = felec;
1453
1454             fscal            = _mm_and_ps(fscal,cutoff_mask);
1455
1456             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1457
1458             /* Calculate temporary vectorial force */
1459             tx               = _mm_mul_ps(fscal,dx20);
1460             ty               = _mm_mul_ps(fscal,dy20);
1461             tz               = _mm_mul_ps(fscal,dz20);
1462
1463             /* Update vectorial force */
1464             fix2             = _mm_add_ps(fix2,tx);
1465             fiy2             = _mm_add_ps(fiy2,ty);
1466             fiz2             = _mm_add_ps(fiz2,tz);
1467
1468             fjx0             = _mm_add_ps(fjx0,tx);
1469             fjy0             = _mm_add_ps(fjy0,ty);
1470             fjz0             = _mm_add_ps(fjz0,tz);
1471             
1472             }
1473
1474             /**************************
1475              * CALCULATE INTERACTIONS *
1476              **************************/
1477
1478             if (gmx_mm_any_lt(rsq21,rcutoff2))
1479             {
1480
1481             r21              = _mm_mul_ps(rsq21,rinv21);
1482             r21              = _mm_andnot_ps(dummy_mask,r21);
1483
1484             /* EWALD ELECTROSTATICS */
1485
1486             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1487             ewrt             = _mm_mul_ps(r21,ewtabscale);
1488             ewitab           = _mm_cvttps_epi32(ewrt);
1489             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1490             ewitab           = _mm_slli_epi32(ewitab,2);
1491             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1492             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1493             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1494             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1495             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1496             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1497             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1498             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1499             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1500
1501             d                = _mm_sub_ps(r21,rswitch);
1502             d                = _mm_max_ps(d,_mm_setzero_ps());
1503             d2               = _mm_mul_ps(d,d);
1504             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1505
1506             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1507
1508             /* Evaluate switch function */
1509             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1510             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
1511             velec            = _mm_mul_ps(velec,sw);
1512             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1513
1514             /* Update potential sum for this i atom from the interaction with this j atom. */
1515             velec            = _mm_and_ps(velec,cutoff_mask);
1516             velec            = _mm_andnot_ps(dummy_mask,velec);
1517             velecsum         = _mm_add_ps(velecsum,velec);
1518
1519             fscal            = felec;
1520
1521             fscal            = _mm_and_ps(fscal,cutoff_mask);
1522
1523             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1524
1525             /* Calculate temporary vectorial force */
1526             tx               = _mm_mul_ps(fscal,dx21);
1527             ty               = _mm_mul_ps(fscal,dy21);
1528             tz               = _mm_mul_ps(fscal,dz21);
1529
1530             /* Update vectorial force */
1531             fix2             = _mm_add_ps(fix2,tx);
1532             fiy2             = _mm_add_ps(fiy2,ty);
1533             fiz2             = _mm_add_ps(fiz2,tz);
1534
1535             fjx1             = _mm_add_ps(fjx1,tx);
1536             fjy1             = _mm_add_ps(fjy1,ty);
1537             fjz1             = _mm_add_ps(fjz1,tz);
1538             
1539             }
1540
1541             /**************************
1542              * CALCULATE INTERACTIONS *
1543              **************************/
1544
1545             if (gmx_mm_any_lt(rsq22,rcutoff2))
1546             {
1547
1548             r22              = _mm_mul_ps(rsq22,rinv22);
1549             r22              = _mm_andnot_ps(dummy_mask,r22);
1550
1551             /* EWALD ELECTROSTATICS */
1552
1553             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1554             ewrt             = _mm_mul_ps(r22,ewtabscale);
1555             ewitab           = _mm_cvttps_epi32(ewrt);
1556             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1557             ewitab           = _mm_slli_epi32(ewitab,2);
1558             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1559             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1560             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1561             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1562             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1563             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1564             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1565             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1566             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1567
1568             d                = _mm_sub_ps(r22,rswitch);
1569             d                = _mm_max_ps(d,_mm_setzero_ps());
1570             d2               = _mm_mul_ps(d,d);
1571             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1572
1573             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1574
1575             /* Evaluate switch function */
1576             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1577             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
1578             velec            = _mm_mul_ps(velec,sw);
1579             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1580
1581             /* Update potential sum for this i atom from the interaction with this j atom. */
1582             velec            = _mm_and_ps(velec,cutoff_mask);
1583             velec            = _mm_andnot_ps(dummy_mask,velec);
1584             velecsum         = _mm_add_ps(velecsum,velec);
1585
1586             fscal            = felec;
1587
1588             fscal            = _mm_and_ps(fscal,cutoff_mask);
1589
1590             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1591
1592             /* Calculate temporary vectorial force */
1593             tx               = _mm_mul_ps(fscal,dx22);
1594             ty               = _mm_mul_ps(fscal,dy22);
1595             tz               = _mm_mul_ps(fscal,dz22);
1596
1597             /* Update vectorial force */
1598             fix2             = _mm_add_ps(fix2,tx);
1599             fiy2             = _mm_add_ps(fiy2,ty);
1600             fiz2             = _mm_add_ps(fiz2,tz);
1601
1602             fjx2             = _mm_add_ps(fjx2,tx);
1603             fjy2             = _mm_add_ps(fjy2,ty);
1604             fjz2             = _mm_add_ps(fjz2,tz);
1605             
1606             }
1607
1608             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1609             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1610             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1611             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1612
1613             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1614                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1615
1616             /* Inner loop uses 612 flops */
1617         }
1618
1619         /* End of innermost loop */
1620
1621         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1622                                               f+i_coord_offset,fshift+i_shift_offset);
1623
1624         ggid                        = gid[iidx];
1625         /* Update potential energies */
1626         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1627         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1628
1629         /* Increment number of inner iterations */
1630         inneriter                  += j_index_end - j_index_start;
1631
1632         /* Outer loop uses 20 flops */
1633     }
1634
1635     /* Increment number of outer iterations */
1636     outeriter        += nri;
1637
1638     /* Update outer/inner flops */
1639
1640     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*612);
1641 }
1642 /*
1643  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1644  * Electrostatics interaction: Ewald
1645  * VdW interaction:            LennardJones
1646  * Geometry:                   Water3-Water3
1647  * Calculate force/pot:        Force
1648  */
1649 void
1650 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_sse2_single
1651                     (t_nblist * gmx_restrict                nlist,
1652                      rvec * gmx_restrict                    xx,
1653                      rvec * gmx_restrict                    ff,
1654                      t_forcerec * gmx_restrict              fr,
1655                      t_mdatoms * gmx_restrict               mdatoms,
1656                      nb_kernel_data_t * gmx_restrict        kernel_data,
1657                      t_nrnb * gmx_restrict                  nrnb)
1658 {
1659     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1660      * just 0 for non-waters.
1661      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1662      * jnr indices corresponding to data put in the four positions in the SIMD register.
1663      */
1664     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1665     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1666     int              jnrA,jnrB,jnrC,jnrD;
1667     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1668     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1669     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1670     real             rcutoff_scalar;
1671     real             *shiftvec,*fshift,*x,*f;
1672     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1673     real             scratch[4*DIM];
1674     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1675     int              vdwioffset0;
1676     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1677     int              vdwioffset1;
1678     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1679     int              vdwioffset2;
1680     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1681     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1682     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1683     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1684     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1685     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1686     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1687     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1688     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1689     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1690     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1691     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1692     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1693     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1694     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1695     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1696     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1697     real             *charge;
1698     int              nvdwtype;
1699     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1700     int              *vdwtype;
1701     real             *vdwparam;
1702     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1703     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1704     __m128i          ewitab;
1705     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1706     real             *ewtab;
1707     __m128           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1708     real             rswitch_scalar,d_scalar;
1709     __m128           dummy_mask,cutoff_mask;
1710     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1711     __m128           one     = _mm_set1_ps(1.0);
1712     __m128           two     = _mm_set1_ps(2.0);
1713     x                = xx[0];
1714     f                = ff[0];
1715
1716     nri              = nlist->nri;
1717     iinr             = nlist->iinr;
1718     jindex           = nlist->jindex;
1719     jjnr             = nlist->jjnr;
1720     shiftidx         = nlist->shift;
1721     gid              = nlist->gid;
1722     shiftvec         = fr->shift_vec[0];
1723     fshift           = fr->fshift[0];
1724     facel            = _mm_set1_ps(fr->epsfac);
1725     charge           = mdatoms->chargeA;
1726     nvdwtype         = fr->ntype;
1727     vdwparam         = fr->nbfp;
1728     vdwtype          = mdatoms->typeA;
1729
1730     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1731     ewtab            = fr->ic->tabq_coul_FDV0;
1732     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1733     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1734
1735     /* Setup water-specific parameters */
1736     inr              = nlist->iinr[0];
1737     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1738     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1739     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1740     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1741
1742     jq0              = _mm_set1_ps(charge[inr+0]);
1743     jq1              = _mm_set1_ps(charge[inr+1]);
1744     jq2              = _mm_set1_ps(charge[inr+2]);
1745     vdwjidx0A        = 2*vdwtype[inr+0];
1746     qq00             = _mm_mul_ps(iq0,jq0);
1747     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1748     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1749     qq01             = _mm_mul_ps(iq0,jq1);
1750     qq02             = _mm_mul_ps(iq0,jq2);
1751     qq10             = _mm_mul_ps(iq1,jq0);
1752     qq11             = _mm_mul_ps(iq1,jq1);
1753     qq12             = _mm_mul_ps(iq1,jq2);
1754     qq20             = _mm_mul_ps(iq2,jq0);
1755     qq21             = _mm_mul_ps(iq2,jq1);
1756     qq22             = _mm_mul_ps(iq2,jq2);
1757
1758     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1759     rcutoff_scalar   = fr->rcoulomb;
1760     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1761     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1762
1763     rswitch_scalar   = fr->rcoulomb_switch;
1764     rswitch          = _mm_set1_ps(rswitch_scalar);
1765     /* Setup switch parameters */
1766     d_scalar         = rcutoff_scalar-rswitch_scalar;
1767     d                = _mm_set1_ps(d_scalar);
1768     swV3             = _mm_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1769     swV4             = _mm_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1770     swV5             = _mm_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1771     swF2             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1772     swF3             = _mm_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1773     swF4             = _mm_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1774
1775     /* Avoid stupid compiler warnings */
1776     jnrA = jnrB = jnrC = jnrD = 0;
1777     j_coord_offsetA = 0;
1778     j_coord_offsetB = 0;
1779     j_coord_offsetC = 0;
1780     j_coord_offsetD = 0;
1781
1782     outeriter        = 0;
1783     inneriter        = 0;
1784
1785     for(iidx=0;iidx<4*DIM;iidx++)
1786     {
1787         scratch[iidx] = 0.0;
1788     }  
1789
1790     /* Start outer loop over neighborlists */
1791     for(iidx=0; iidx<nri; iidx++)
1792     {
1793         /* Load shift vector for this list */
1794         i_shift_offset   = DIM*shiftidx[iidx];
1795
1796         /* Load limits for loop over neighbors */
1797         j_index_start    = jindex[iidx];
1798         j_index_end      = jindex[iidx+1];
1799
1800         /* Get outer coordinate index */
1801         inr              = iinr[iidx];
1802         i_coord_offset   = DIM*inr;
1803
1804         /* Load i particle coords and add shift vector */
1805         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1806                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1807         
1808         fix0             = _mm_setzero_ps();
1809         fiy0             = _mm_setzero_ps();
1810         fiz0             = _mm_setzero_ps();
1811         fix1             = _mm_setzero_ps();
1812         fiy1             = _mm_setzero_ps();
1813         fiz1             = _mm_setzero_ps();
1814         fix2             = _mm_setzero_ps();
1815         fiy2             = _mm_setzero_ps();
1816         fiz2             = _mm_setzero_ps();
1817
1818         /* Start inner kernel loop */
1819         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1820         {
1821
1822             /* Get j neighbor index, and coordinate index */
1823             jnrA             = jjnr[jidx];
1824             jnrB             = jjnr[jidx+1];
1825             jnrC             = jjnr[jidx+2];
1826             jnrD             = jjnr[jidx+3];
1827             j_coord_offsetA  = DIM*jnrA;
1828             j_coord_offsetB  = DIM*jnrB;
1829             j_coord_offsetC  = DIM*jnrC;
1830             j_coord_offsetD  = DIM*jnrD;
1831
1832             /* load j atom coordinates */
1833             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1834                                               x+j_coord_offsetC,x+j_coord_offsetD,
1835                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1836
1837             /* Calculate displacement vector */
1838             dx00             = _mm_sub_ps(ix0,jx0);
1839             dy00             = _mm_sub_ps(iy0,jy0);
1840             dz00             = _mm_sub_ps(iz0,jz0);
1841             dx01             = _mm_sub_ps(ix0,jx1);
1842             dy01             = _mm_sub_ps(iy0,jy1);
1843             dz01             = _mm_sub_ps(iz0,jz1);
1844             dx02             = _mm_sub_ps(ix0,jx2);
1845             dy02             = _mm_sub_ps(iy0,jy2);
1846             dz02             = _mm_sub_ps(iz0,jz2);
1847             dx10             = _mm_sub_ps(ix1,jx0);
1848             dy10             = _mm_sub_ps(iy1,jy0);
1849             dz10             = _mm_sub_ps(iz1,jz0);
1850             dx11             = _mm_sub_ps(ix1,jx1);
1851             dy11             = _mm_sub_ps(iy1,jy1);
1852             dz11             = _mm_sub_ps(iz1,jz1);
1853             dx12             = _mm_sub_ps(ix1,jx2);
1854             dy12             = _mm_sub_ps(iy1,jy2);
1855             dz12             = _mm_sub_ps(iz1,jz2);
1856             dx20             = _mm_sub_ps(ix2,jx0);
1857             dy20             = _mm_sub_ps(iy2,jy0);
1858             dz20             = _mm_sub_ps(iz2,jz0);
1859             dx21             = _mm_sub_ps(ix2,jx1);
1860             dy21             = _mm_sub_ps(iy2,jy1);
1861             dz21             = _mm_sub_ps(iz2,jz1);
1862             dx22             = _mm_sub_ps(ix2,jx2);
1863             dy22             = _mm_sub_ps(iy2,jy2);
1864             dz22             = _mm_sub_ps(iz2,jz2);
1865
1866             /* Calculate squared distance and things based on it */
1867             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1868             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1869             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1870             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1871             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1872             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1873             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1874             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1875             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1876
1877             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1878             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1879             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1880             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1881             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1882             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1883             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1884             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1885             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1886
1887             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1888             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1889             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1890             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1891             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1892             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1893             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1894             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1895             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1896
1897             fjx0             = _mm_setzero_ps();
1898             fjy0             = _mm_setzero_ps();
1899             fjz0             = _mm_setzero_ps();
1900             fjx1             = _mm_setzero_ps();
1901             fjy1             = _mm_setzero_ps();
1902             fjz1             = _mm_setzero_ps();
1903             fjx2             = _mm_setzero_ps();
1904             fjy2             = _mm_setzero_ps();
1905             fjz2             = _mm_setzero_ps();
1906
1907             /**************************
1908              * CALCULATE INTERACTIONS *
1909              **************************/
1910
1911             if (gmx_mm_any_lt(rsq00,rcutoff2))
1912             {
1913
1914             r00              = _mm_mul_ps(rsq00,rinv00);
1915
1916             /* EWALD ELECTROSTATICS */
1917
1918             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1919             ewrt             = _mm_mul_ps(r00,ewtabscale);
1920             ewitab           = _mm_cvttps_epi32(ewrt);
1921             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1922             ewitab           = _mm_slli_epi32(ewitab,2);
1923             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1924             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1925             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1926             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1927             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1928             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1929             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1930             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
1931             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1932
1933             /* LENNARD-JONES DISPERSION/REPULSION */
1934
1935             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1936             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
1937             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
1938             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
1939             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
1940
1941             d                = _mm_sub_ps(r00,rswitch);
1942             d                = _mm_max_ps(d,_mm_setzero_ps());
1943             d2               = _mm_mul_ps(d,d);
1944             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
1945
1946             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
1947
1948             /* Evaluate switch function */
1949             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1950             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
1951             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
1952             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1953
1954             fscal            = _mm_add_ps(felec,fvdw);
1955
1956             fscal            = _mm_and_ps(fscal,cutoff_mask);
1957
1958             /* Calculate temporary vectorial force */
1959             tx               = _mm_mul_ps(fscal,dx00);
1960             ty               = _mm_mul_ps(fscal,dy00);
1961             tz               = _mm_mul_ps(fscal,dz00);
1962
1963             /* Update vectorial force */
1964             fix0             = _mm_add_ps(fix0,tx);
1965             fiy0             = _mm_add_ps(fiy0,ty);
1966             fiz0             = _mm_add_ps(fiz0,tz);
1967
1968             fjx0             = _mm_add_ps(fjx0,tx);
1969             fjy0             = _mm_add_ps(fjy0,ty);
1970             fjz0             = _mm_add_ps(fjz0,tz);
1971             
1972             }
1973
1974             /**************************
1975              * CALCULATE INTERACTIONS *
1976              **************************/
1977
1978             if (gmx_mm_any_lt(rsq01,rcutoff2))
1979             {
1980
1981             r01              = _mm_mul_ps(rsq01,rinv01);
1982
1983             /* EWALD ELECTROSTATICS */
1984
1985             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1986             ewrt             = _mm_mul_ps(r01,ewtabscale);
1987             ewitab           = _mm_cvttps_epi32(ewrt);
1988             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1989             ewitab           = _mm_slli_epi32(ewitab,2);
1990             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1991             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1992             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1993             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1994             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1995             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1996             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1997             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
1998             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1999
2000             d                = _mm_sub_ps(r01,rswitch);
2001             d                = _mm_max_ps(d,_mm_setzero_ps());
2002             d2               = _mm_mul_ps(d,d);
2003             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2004
2005             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2006
2007             /* Evaluate switch function */
2008             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2009             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2010             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2011
2012             fscal            = felec;
2013
2014             fscal            = _mm_and_ps(fscal,cutoff_mask);
2015
2016             /* Calculate temporary vectorial force */
2017             tx               = _mm_mul_ps(fscal,dx01);
2018             ty               = _mm_mul_ps(fscal,dy01);
2019             tz               = _mm_mul_ps(fscal,dz01);
2020
2021             /* Update vectorial force */
2022             fix0             = _mm_add_ps(fix0,tx);
2023             fiy0             = _mm_add_ps(fiy0,ty);
2024             fiz0             = _mm_add_ps(fiz0,tz);
2025
2026             fjx1             = _mm_add_ps(fjx1,tx);
2027             fjy1             = _mm_add_ps(fjy1,ty);
2028             fjz1             = _mm_add_ps(fjz1,tz);
2029             
2030             }
2031
2032             /**************************
2033              * CALCULATE INTERACTIONS *
2034              **************************/
2035
2036             if (gmx_mm_any_lt(rsq02,rcutoff2))
2037             {
2038
2039             r02              = _mm_mul_ps(rsq02,rinv02);
2040
2041             /* EWALD ELECTROSTATICS */
2042
2043             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2044             ewrt             = _mm_mul_ps(r02,ewtabscale);
2045             ewitab           = _mm_cvttps_epi32(ewrt);
2046             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2047             ewitab           = _mm_slli_epi32(ewitab,2);
2048             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2049             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2050             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2051             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2052             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2053             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2054             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2055             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2056             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2057
2058             d                = _mm_sub_ps(r02,rswitch);
2059             d                = _mm_max_ps(d,_mm_setzero_ps());
2060             d2               = _mm_mul_ps(d,d);
2061             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2062
2063             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2064
2065             /* Evaluate switch function */
2066             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2067             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2068             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2069
2070             fscal            = felec;
2071
2072             fscal            = _mm_and_ps(fscal,cutoff_mask);
2073
2074             /* Calculate temporary vectorial force */
2075             tx               = _mm_mul_ps(fscal,dx02);
2076             ty               = _mm_mul_ps(fscal,dy02);
2077             tz               = _mm_mul_ps(fscal,dz02);
2078
2079             /* Update vectorial force */
2080             fix0             = _mm_add_ps(fix0,tx);
2081             fiy0             = _mm_add_ps(fiy0,ty);
2082             fiz0             = _mm_add_ps(fiz0,tz);
2083
2084             fjx2             = _mm_add_ps(fjx2,tx);
2085             fjy2             = _mm_add_ps(fjy2,ty);
2086             fjz2             = _mm_add_ps(fjz2,tz);
2087             
2088             }
2089
2090             /**************************
2091              * CALCULATE INTERACTIONS *
2092              **************************/
2093
2094             if (gmx_mm_any_lt(rsq10,rcutoff2))
2095             {
2096
2097             r10              = _mm_mul_ps(rsq10,rinv10);
2098
2099             /* EWALD ELECTROSTATICS */
2100
2101             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2102             ewrt             = _mm_mul_ps(r10,ewtabscale);
2103             ewitab           = _mm_cvttps_epi32(ewrt);
2104             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2105             ewitab           = _mm_slli_epi32(ewitab,2);
2106             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2107             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2108             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2109             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2110             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2111             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2112             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2113             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2114             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2115
2116             d                = _mm_sub_ps(r10,rswitch);
2117             d                = _mm_max_ps(d,_mm_setzero_ps());
2118             d2               = _mm_mul_ps(d,d);
2119             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2120
2121             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2122
2123             /* Evaluate switch function */
2124             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2125             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2126             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2127
2128             fscal            = felec;
2129
2130             fscal            = _mm_and_ps(fscal,cutoff_mask);
2131
2132             /* Calculate temporary vectorial force */
2133             tx               = _mm_mul_ps(fscal,dx10);
2134             ty               = _mm_mul_ps(fscal,dy10);
2135             tz               = _mm_mul_ps(fscal,dz10);
2136
2137             /* Update vectorial force */
2138             fix1             = _mm_add_ps(fix1,tx);
2139             fiy1             = _mm_add_ps(fiy1,ty);
2140             fiz1             = _mm_add_ps(fiz1,tz);
2141
2142             fjx0             = _mm_add_ps(fjx0,tx);
2143             fjy0             = _mm_add_ps(fjy0,ty);
2144             fjz0             = _mm_add_ps(fjz0,tz);
2145             
2146             }
2147
2148             /**************************
2149              * CALCULATE INTERACTIONS *
2150              **************************/
2151
2152             if (gmx_mm_any_lt(rsq11,rcutoff2))
2153             {
2154
2155             r11              = _mm_mul_ps(rsq11,rinv11);
2156
2157             /* EWALD ELECTROSTATICS */
2158
2159             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2160             ewrt             = _mm_mul_ps(r11,ewtabscale);
2161             ewitab           = _mm_cvttps_epi32(ewrt);
2162             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2163             ewitab           = _mm_slli_epi32(ewitab,2);
2164             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2165             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2166             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2167             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2168             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2169             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2170             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2171             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2172             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2173
2174             d                = _mm_sub_ps(r11,rswitch);
2175             d                = _mm_max_ps(d,_mm_setzero_ps());
2176             d2               = _mm_mul_ps(d,d);
2177             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2178
2179             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2180
2181             /* Evaluate switch function */
2182             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2183             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2184             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2185
2186             fscal            = felec;
2187
2188             fscal            = _mm_and_ps(fscal,cutoff_mask);
2189
2190             /* Calculate temporary vectorial force */
2191             tx               = _mm_mul_ps(fscal,dx11);
2192             ty               = _mm_mul_ps(fscal,dy11);
2193             tz               = _mm_mul_ps(fscal,dz11);
2194
2195             /* Update vectorial force */
2196             fix1             = _mm_add_ps(fix1,tx);
2197             fiy1             = _mm_add_ps(fiy1,ty);
2198             fiz1             = _mm_add_ps(fiz1,tz);
2199
2200             fjx1             = _mm_add_ps(fjx1,tx);
2201             fjy1             = _mm_add_ps(fjy1,ty);
2202             fjz1             = _mm_add_ps(fjz1,tz);
2203             
2204             }
2205
2206             /**************************
2207              * CALCULATE INTERACTIONS *
2208              **************************/
2209
2210             if (gmx_mm_any_lt(rsq12,rcutoff2))
2211             {
2212
2213             r12              = _mm_mul_ps(rsq12,rinv12);
2214
2215             /* EWALD ELECTROSTATICS */
2216
2217             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2218             ewrt             = _mm_mul_ps(r12,ewtabscale);
2219             ewitab           = _mm_cvttps_epi32(ewrt);
2220             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2221             ewitab           = _mm_slli_epi32(ewitab,2);
2222             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2223             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2224             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2225             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2226             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2227             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2228             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2229             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2230             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2231
2232             d                = _mm_sub_ps(r12,rswitch);
2233             d                = _mm_max_ps(d,_mm_setzero_ps());
2234             d2               = _mm_mul_ps(d,d);
2235             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2236
2237             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2238
2239             /* Evaluate switch function */
2240             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2241             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2242             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2243
2244             fscal            = felec;
2245
2246             fscal            = _mm_and_ps(fscal,cutoff_mask);
2247
2248             /* Calculate temporary vectorial force */
2249             tx               = _mm_mul_ps(fscal,dx12);
2250             ty               = _mm_mul_ps(fscal,dy12);
2251             tz               = _mm_mul_ps(fscal,dz12);
2252
2253             /* Update vectorial force */
2254             fix1             = _mm_add_ps(fix1,tx);
2255             fiy1             = _mm_add_ps(fiy1,ty);
2256             fiz1             = _mm_add_ps(fiz1,tz);
2257
2258             fjx2             = _mm_add_ps(fjx2,tx);
2259             fjy2             = _mm_add_ps(fjy2,ty);
2260             fjz2             = _mm_add_ps(fjz2,tz);
2261             
2262             }
2263
2264             /**************************
2265              * CALCULATE INTERACTIONS *
2266              **************************/
2267
2268             if (gmx_mm_any_lt(rsq20,rcutoff2))
2269             {
2270
2271             r20              = _mm_mul_ps(rsq20,rinv20);
2272
2273             /* EWALD ELECTROSTATICS */
2274
2275             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2276             ewrt             = _mm_mul_ps(r20,ewtabscale);
2277             ewitab           = _mm_cvttps_epi32(ewrt);
2278             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2279             ewitab           = _mm_slli_epi32(ewitab,2);
2280             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2281             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2282             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2283             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2284             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2285             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2286             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2287             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2288             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2289
2290             d                = _mm_sub_ps(r20,rswitch);
2291             d                = _mm_max_ps(d,_mm_setzero_ps());
2292             d2               = _mm_mul_ps(d,d);
2293             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2294
2295             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2296
2297             /* Evaluate switch function */
2298             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2299             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2300             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2301
2302             fscal            = felec;
2303
2304             fscal            = _mm_and_ps(fscal,cutoff_mask);
2305
2306             /* Calculate temporary vectorial force */
2307             tx               = _mm_mul_ps(fscal,dx20);
2308             ty               = _mm_mul_ps(fscal,dy20);
2309             tz               = _mm_mul_ps(fscal,dz20);
2310
2311             /* Update vectorial force */
2312             fix2             = _mm_add_ps(fix2,tx);
2313             fiy2             = _mm_add_ps(fiy2,ty);
2314             fiz2             = _mm_add_ps(fiz2,tz);
2315
2316             fjx0             = _mm_add_ps(fjx0,tx);
2317             fjy0             = _mm_add_ps(fjy0,ty);
2318             fjz0             = _mm_add_ps(fjz0,tz);
2319             
2320             }
2321
2322             /**************************
2323              * CALCULATE INTERACTIONS *
2324              **************************/
2325
2326             if (gmx_mm_any_lt(rsq21,rcutoff2))
2327             {
2328
2329             r21              = _mm_mul_ps(rsq21,rinv21);
2330
2331             /* EWALD ELECTROSTATICS */
2332
2333             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2334             ewrt             = _mm_mul_ps(r21,ewtabscale);
2335             ewitab           = _mm_cvttps_epi32(ewrt);
2336             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2337             ewitab           = _mm_slli_epi32(ewitab,2);
2338             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2339             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2340             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2341             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2342             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2343             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2344             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2345             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
2346             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2347
2348             d                = _mm_sub_ps(r21,rswitch);
2349             d                = _mm_max_ps(d,_mm_setzero_ps());
2350             d2               = _mm_mul_ps(d,d);
2351             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2352
2353             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2354
2355             /* Evaluate switch function */
2356             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2357             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
2358             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2359
2360             fscal            = felec;
2361
2362             fscal            = _mm_and_ps(fscal,cutoff_mask);
2363
2364             /* Calculate temporary vectorial force */
2365             tx               = _mm_mul_ps(fscal,dx21);
2366             ty               = _mm_mul_ps(fscal,dy21);
2367             tz               = _mm_mul_ps(fscal,dz21);
2368
2369             /* Update vectorial force */
2370             fix2             = _mm_add_ps(fix2,tx);
2371             fiy2             = _mm_add_ps(fiy2,ty);
2372             fiz2             = _mm_add_ps(fiz2,tz);
2373
2374             fjx1             = _mm_add_ps(fjx1,tx);
2375             fjy1             = _mm_add_ps(fjy1,ty);
2376             fjz1             = _mm_add_ps(fjz1,tz);
2377             
2378             }
2379
2380             /**************************
2381              * CALCULATE INTERACTIONS *
2382              **************************/
2383
2384             if (gmx_mm_any_lt(rsq22,rcutoff2))
2385             {
2386
2387             r22              = _mm_mul_ps(rsq22,rinv22);
2388
2389             /* EWALD ELECTROSTATICS */
2390
2391             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2392             ewrt             = _mm_mul_ps(r22,ewtabscale);
2393             ewitab           = _mm_cvttps_epi32(ewrt);
2394             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2395             ewitab           = _mm_slli_epi32(ewitab,2);
2396             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2397             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2398             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2399             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2400             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2401             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2402             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2403             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
2404             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2405
2406             d                = _mm_sub_ps(r22,rswitch);
2407             d                = _mm_max_ps(d,_mm_setzero_ps());
2408             d2               = _mm_mul_ps(d,d);
2409             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2410
2411             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2412
2413             /* Evaluate switch function */
2414             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2415             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
2416             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2417
2418             fscal            = felec;
2419
2420             fscal            = _mm_and_ps(fscal,cutoff_mask);
2421
2422             /* Calculate temporary vectorial force */
2423             tx               = _mm_mul_ps(fscal,dx22);
2424             ty               = _mm_mul_ps(fscal,dy22);
2425             tz               = _mm_mul_ps(fscal,dz22);
2426
2427             /* Update vectorial force */
2428             fix2             = _mm_add_ps(fix2,tx);
2429             fiy2             = _mm_add_ps(fiy2,ty);
2430             fiz2             = _mm_add_ps(fiz2,tz);
2431
2432             fjx2             = _mm_add_ps(fjx2,tx);
2433             fjy2             = _mm_add_ps(fjy2,ty);
2434             fjz2             = _mm_add_ps(fjz2,tz);
2435             
2436             }
2437
2438             fjptrA             = f+j_coord_offsetA;
2439             fjptrB             = f+j_coord_offsetB;
2440             fjptrC             = f+j_coord_offsetC;
2441             fjptrD             = f+j_coord_offsetD;
2442
2443             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2444                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2445
2446             /* Inner loop uses 573 flops */
2447         }
2448
2449         if(jidx<j_index_end)
2450         {
2451
2452             /* Get j neighbor index, and coordinate index */
2453             jnrlistA         = jjnr[jidx];
2454             jnrlistB         = jjnr[jidx+1];
2455             jnrlistC         = jjnr[jidx+2];
2456             jnrlistD         = jjnr[jidx+3];
2457             /* Sign of each element will be negative for non-real atoms.
2458              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2459              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2460              */
2461             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2462             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2463             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2464             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2465             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2466             j_coord_offsetA  = DIM*jnrA;
2467             j_coord_offsetB  = DIM*jnrB;
2468             j_coord_offsetC  = DIM*jnrC;
2469             j_coord_offsetD  = DIM*jnrD;
2470
2471             /* load j atom coordinates */
2472             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2473                                               x+j_coord_offsetC,x+j_coord_offsetD,
2474                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2475
2476             /* Calculate displacement vector */
2477             dx00             = _mm_sub_ps(ix0,jx0);
2478             dy00             = _mm_sub_ps(iy0,jy0);
2479             dz00             = _mm_sub_ps(iz0,jz0);
2480             dx01             = _mm_sub_ps(ix0,jx1);
2481             dy01             = _mm_sub_ps(iy0,jy1);
2482             dz01             = _mm_sub_ps(iz0,jz1);
2483             dx02             = _mm_sub_ps(ix0,jx2);
2484             dy02             = _mm_sub_ps(iy0,jy2);
2485             dz02             = _mm_sub_ps(iz0,jz2);
2486             dx10             = _mm_sub_ps(ix1,jx0);
2487             dy10             = _mm_sub_ps(iy1,jy0);
2488             dz10             = _mm_sub_ps(iz1,jz0);
2489             dx11             = _mm_sub_ps(ix1,jx1);
2490             dy11             = _mm_sub_ps(iy1,jy1);
2491             dz11             = _mm_sub_ps(iz1,jz1);
2492             dx12             = _mm_sub_ps(ix1,jx2);
2493             dy12             = _mm_sub_ps(iy1,jy2);
2494             dz12             = _mm_sub_ps(iz1,jz2);
2495             dx20             = _mm_sub_ps(ix2,jx0);
2496             dy20             = _mm_sub_ps(iy2,jy0);
2497             dz20             = _mm_sub_ps(iz2,jz0);
2498             dx21             = _mm_sub_ps(ix2,jx1);
2499             dy21             = _mm_sub_ps(iy2,jy1);
2500             dz21             = _mm_sub_ps(iz2,jz1);
2501             dx22             = _mm_sub_ps(ix2,jx2);
2502             dy22             = _mm_sub_ps(iy2,jy2);
2503             dz22             = _mm_sub_ps(iz2,jz2);
2504
2505             /* Calculate squared distance and things based on it */
2506             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2507             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2508             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2509             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2510             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2511             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2512             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2513             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2514             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2515
2516             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2517             rinv01           = gmx_mm_invsqrt_ps(rsq01);
2518             rinv02           = gmx_mm_invsqrt_ps(rsq02);
2519             rinv10           = gmx_mm_invsqrt_ps(rsq10);
2520             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2521             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2522             rinv20           = gmx_mm_invsqrt_ps(rsq20);
2523             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2524             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2525
2526             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2527             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2528             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2529             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2530             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2531             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2532             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2533             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2534             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2535
2536             fjx0             = _mm_setzero_ps();
2537             fjy0             = _mm_setzero_ps();
2538             fjz0             = _mm_setzero_ps();
2539             fjx1             = _mm_setzero_ps();
2540             fjy1             = _mm_setzero_ps();
2541             fjz1             = _mm_setzero_ps();
2542             fjx2             = _mm_setzero_ps();
2543             fjy2             = _mm_setzero_ps();
2544             fjz2             = _mm_setzero_ps();
2545
2546             /**************************
2547              * CALCULATE INTERACTIONS *
2548              **************************/
2549
2550             if (gmx_mm_any_lt(rsq00,rcutoff2))
2551             {
2552
2553             r00              = _mm_mul_ps(rsq00,rinv00);
2554             r00              = _mm_andnot_ps(dummy_mask,r00);
2555
2556             /* EWALD ELECTROSTATICS */
2557
2558             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2559             ewrt             = _mm_mul_ps(r00,ewtabscale);
2560             ewitab           = _mm_cvttps_epi32(ewrt);
2561             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2562             ewitab           = _mm_slli_epi32(ewitab,2);
2563             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2564             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2565             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2566             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2567             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2568             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2569             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2570             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
2571             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2572
2573             /* LENNARD-JONES DISPERSION/REPULSION */
2574
2575             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2576             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
2577             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
2578             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
2579             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
2580
2581             d                = _mm_sub_ps(r00,rswitch);
2582             d                = _mm_max_ps(d,_mm_setzero_ps());
2583             d2               = _mm_mul_ps(d,d);
2584             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2585
2586             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2587
2588             /* Evaluate switch function */
2589             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2590             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(velec,dsw)) );
2591             fvdw             = _mm_sub_ps( _mm_mul_ps(fvdw,sw) , _mm_mul_ps(rinv00,_mm_mul_ps(vvdw,dsw)) );
2592             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2593
2594             fscal            = _mm_add_ps(felec,fvdw);
2595
2596             fscal            = _mm_and_ps(fscal,cutoff_mask);
2597
2598             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2599
2600             /* Calculate temporary vectorial force */
2601             tx               = _mm_mul_ps(fscal,dx00);
2602             ty               = _mm_mul_ps(fscal,dy00);
2603             tz               = _mm_mul_ps(fscal,dz00);
2604
2605             /* Update vectorial force */
2606             fix0             = _mm_add_ps(fix0,tx);
2607             fiy0             = _mm_add_ps(fiy0,ty);
2608             fiz0             = _mm_add_ps(fiz0,tz);
2609
2610             fjx0             = _mm_add_ps(fjx0,tx);
2611             fjy0             = _mm_add_ps(fjy0,ty);
2612             fjz0             = _mm_add_ps(fjz0,tz);
2613             
2614             }
2615
2616             /**************************
2617              * CALCULATE INTERACTIONS *
2618              **************************/
2619
2620             if (gmx_mm_any_lt(rsq01,rcutoff2))
2621             {
2622
2623             r01              = _mm_mul_ps(rsq01,rinv01);
2624             r01              = _mm_andnot_ps(dummy_mask,r01);
2625
2626             /* EWALD ELECTROSTATICS */
2627
2628             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2629             ewrt             = _mm_mul_ps(r01,ewtabscale);
2630             ewitab           = _mm_cvttps_epi32(ewrt);
2631             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2632             ewitab           = _mm_slli_epi32(ewitab,2);
2633             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2634             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2635             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2636             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2637             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2638             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2639             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2640             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
2641             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2642
2643             d                = _mm_sub_ps(r01,rswitch);
2644             d                = _mm_max_ps(d,_mm_setzero_ps());
2645             d2               = _mm_mul_ps(d,d);
2646             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2647
2648             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2649
2650             /* Evaluate switch function */
2651             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2652             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv01,_mm_mul_ps(velec,dsw)) );
2653             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2654
2655             fscal            = felec;
2656
2657             fscal            = _mm_and_ps(fscal,cutoff_mask);
2658
2659             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2660
2661             /* Calculate temporary vectorial force */
2662             tx               = _mm_mul_ps(fscal,dx01);
2663             ty               = _mm_mul_ps(fscal,dy01);
2664             tz               = _mm_mul_ps(fscal,dz01);
2665
2666             /* Update vectorial force */
2667             fix0             = _mm_add_ps(fix0,tx);
2668             fiy0             = _mm_add_ps(fiy0,ty);
2669             fiz0             = _mm_add_ps(fiz0,tz);
2670
2671             fjx1             = _mm_add_ps(fjx1,tx);
2672             fjy1             = _mm_add_ps(fjy1,ty);
2673             fjz1             = _mm_add_ps(fjz1,tz);
2674             
2675             }
2676
2677             /**************************
2678              * CALCULATE INTERACTIONS *
2679              **************************/
2680
2681             if (gmx_mm_any_lt(rsq02,rcutoff2))
2682             {
2683
2684             r02              = _mm_mul_ps(rsq02,rinv02);
2685             r02              = _mm_andnot_ps(dummy_mask,r02);
2686
2687             /* EWALD ELECTROSTATICS */
2688
2689             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2690             ewrt             = _mm_mul_ps(r02,ewtabscale);
2691             ewitab           = _mm_cvttps_epi32(ewrt);
2692             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2693             ewitab           = _mm_slli_epi32(ewitab,2);
2694             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2695             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2696             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2697             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2698             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2699             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2700             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2701             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
2702             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2703
2704             d                = _mm_sub_ps(r02,rswitch);
2705             d                = _mm_max_ps(d,_mm_setzero_ps());
2706             d2               = _mm_mul_ps(d,d);
2707             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2708
2709             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2710
2711             /* Evaluate switch function */
2712             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2713             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv02,_mm_mul_ps(velec,dsw)) );
2714             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2715
2716             fscal            = felec;
2717
2718             fscal            = _mm_and_ps(fscal,cutoff_mask);
2719
2720             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2721
2722             /* Calculate temporary vectorial force */
2723             tx               = _mm_mul_ps(fscal,dx02);
2724             ty               = _mm_mul_ps(fscal,dy02);
2725             tz               = _mm_mul_ps(fscal,dz02);
2726
2727             /* Update vectorial force */
2728             fix0             = _mm_add_ps(fix0,tx);
2729             fiy0             = _mm_add_ps(fiy0,ty);
2730             fiz0             = _mm_add_ps(fiz0,tz);
2731
2732             fjx2             = _mm_add_ps(fjx2,tx);
2733             fjy2             = _mm_add_ps(fjy2,ty);
2734             fjz2             = _mm_add_ps(fjz2,tz);
2735             
2736             }
2737
2738             /**************************
2739              * CALCULATE INTERACTIONS *
2740              **************************/
2741
2742             if (gmx_mm_any_lt(rsq10,rcutoff2))
2743             {
2744
2745             r10              = _mm_mul_ps(rsq10,rinv10);
2746             r10              = _mm_andnot_ps(dummy_mask,r10);
2747
2748             /* EWALD ELECTROSTATICS */
2749
2750             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2751             ewrt             = _mm_mul_ps(r10,ewtabscale);
2752             ewitab           = _mm_cvttps_epi32(ewrt);
2753             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2754             ewitab           = _mm_slli_epi32(ewitab,2);
2755             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2756             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2757             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2758             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2759             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2760             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2761             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2762             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
2763             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2764
2765             d                = _mm_sub_ps(r10,rswitch);
2766             d                = _mm_max_ps(d,_mm_setzero_ps());
2767             d2               = _mm_mul_ps(d,d);
2768             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2769
2770             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2771
2772             /* Evaluate switch function */
2773             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2774             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv10,_mm_mul_ps(velec,dsw)) );
2775             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2776
2777             fscal            = felec;
2778
2779             fscal            = _mm_and_ps(fscal,cutoff_mask);
2780
2781             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2782
2783             /* Calculate temporary vectorial force */
2784             tx               = _mm_mul_ps(fscal,dx10);
2785             ty               = _mm_mul_ps(fscal,dy10);
2786             tz               = _mm_mul_ps(fscal,dz10);
2787
2788             /* Update vectorial force */
2789             fix1             = _mm_add_ps(fix1,tx);
2790             fiy1             = _mm_add_ps(fiy1,ty);
2791             fiz1             = _mm_add_ps(fiz1,tz);
2792
2793             fjx0             = _mm_add_ps(fjx0,tx);
2794             fjy0             = _mm_add_ps(fjy0,ty);
2795             fjz0             = _mm_add_ps(fjz0,tz);
2796             
2797             }
2798
2799             /**************************
2800              * CALCULATE INTERACTIONS *
2801              **************************/
2802
2803             if (gmx_mm_any_lt(rsq11,rcutoff2))
2804             {
2805
2806             r11              = _mm_mul_ps(rsq11,rinv11);
2807             r11              = _mm_andnot_ps(dummy_mask,r11);
2808
2809             /* EWALD ELECTROSTATICS */
2810
2811             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2812             ewrt             = _mm_mul_ps(r11,ewtabscale);
2813             ewitab           = _mm_cvttps_epi32(ewrt);
2814             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2815             ewitab           = _mm_slli_epi32(ewitab,2);
2816             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2817             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2818             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2819             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2820             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2821             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2822             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2823             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
2824             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2825
2826             d                = _mm_sub_ps(r11,rswitch);
2827             d                = _mm_max_ps(d,_mm_setzero_ps());
2828             d2               = _mm_mul_ps(d,d);
2829             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2830
2831             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2832
2833             /* Evaluate switch function */
2834             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2835             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv11,_mm_mul_ps(velec,dsw)) );
2836             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2837
2838             fscal            = felec;
2839
2840             fscal            = _mm_and_ps(fscal,cutoff_mask);
2841
2842             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2843
2844             /* Calculate temporary vectorial force */
2845             tx               = _mm_mul_ps(fscal,dx11);
2846             ty               = _mm_mul_ps(fscal,dy11);
2847             tz               = _mm_mul_ps(fscal,dz11);
2848
2849             /* Update vectorial force */
2850             fix1             = _mm_add_ps(fix1,tx);
2851             fiy1             = _mm_add_ps(fiy1,ty);
2852             fiz1             = _mm_add_ps(fiz1,tz);
2853
2854             fjx1             = _mm_add_ps(fjx1,tx);
2855             fjy1             = _mm_add_ps(fjy1,ty);
2856             fjz1             = _mm_add_ps(fjz1,tz);
2857             
2858             }
2859
2860             /**************************
2861              * CALCULATE INTERACTIONS *
2862              **************************/
2863
2864             if (gmx_mm_any_lt(rsq12,rcutoff2))
2865             {
2866
2867             r12              = _mm_mul_ps(rsq12,rinv12);
2868             r12              = _mm_andnot_ps(dummy_mask,r12);
2869
2870             /* EWALD ELECTROSTATICS */
2871
2872             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2873             ewrt             = _mm_mul_ps(r12,ewtabscale);
2874             ewitab           = _mm_cvttps_epi32(ewrt);
2875             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2876             ewitab           = _mm_slli_epi32(ewitab,2);
2877             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2878             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2879             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2880             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2881             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2882             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2883             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2884             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
2885             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2886
2887             d                = _mm_sub_ps(r12,rswitch);
2888             d                = _mm_max_ps(d,_mm_setzero_ps());
2889             d2               = _mm_mul_ps(d,d);
2890             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2891
2892             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2893
2894             /* Evaluate switch function */
2895             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2896             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv12,_mm_mul_ps(velec,dsw)) );
2897             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2898
2899             fscal            = felec;
2900
2901             fscal            = _mm_and_ps(fscal,cutoff_mask);
2902
2903             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2904
2905             /* Calculate temporary vectorial force */
2906             tx               = _mm_mul_ps(fscal,dx12);
2907             ty               = _mm_mul_ps(fscal,dy12);
2908             tz               = _mm_mul_ps(fscal,dz12);
2909
2910             /* Update vectorial force */
2911             fix1             = _mm_add_ps(fix1,tx);
2912             fiy1             = _mm_add_ps(fiy1,ty);
2913             fiz1             = _mm_add_ps(fiz1,tz);
2914
2915             fjx2             = _mm_add_ps(fjx2,tx);
2916             fjy2             = _mm_add_ps(fjy2,ty);
2917             fjz2             = _mm_add_ps(fjz2,tz);
2918             
2919             }
2920
2921             /**************************
2922              * CALCULATE INTERACTIONS *
2923              **************************/
2924
2925             if (gmx_mm_any_lt(rsq20,rcutoff2))
2926             {
2927
2928             r20              = _mm_mul_ps(rsq20,rinv20);
2929             r20              = _mm_andnot_ps(dummy_mask,r20);
2930
2931             /* EWALD ELECTROSTATICS */
2932
2933             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2934             ewrt             = _mm_mul_ps(r20,ewtabscale);
2935             ewitab           = _mm_cvttps_epi32(ewrt);
2936             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2937             ewitab           = _mm_slli_epi32(ewitab,2);
2938             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
2939             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
2940             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
2941             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
2942             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
2943             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
2944             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
2945             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
2946             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2947
2948             d                = _mm_sub_ps(r20,rswitch);
2949             d                = _mm_max_ps(d,_mm_setzero_ps());
2950             d2               = _mm_mul_ps(d,d);
2951             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
2952
2953             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
2954
2955             /* Evaluate switch function */
2956             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2957             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv20,_mm_mul_ps(velec,dsw)) );
2958             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2959
2960             fscal            = felec;
2961
2962             fscal            = _mm_and_ps(fscal,cutoff_mask);
2963
2964             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2965
2966             /* Calculate temporary vectorial force */
2967             tx               = _mm_mul_ps(fscal,dx20);
2968             ty               = _mm_mul_ps(fscal,dy20);
2969             tz               = _mm_mul_ps(fscal,dz20);
2970
2971             /* Update vectorial force */
2972             fix2             = _mm_add_ps(fix2,tx);
2973             fiy2             = _mm_add_ps(fiy2,ty);
2974             fiz2             = _mm_add_ps(fiz2,tz);
2975
2976             fjx0             = _mm_add_ps(fjx0,tx);
2977             fjy0             = _mm_add_ps(fjy0,ty);
2978             fjz0             = _mm_add_ps(fjz0,tz);
2979             
2980             }
2981
2982             /**************************
2983              * CALCULATE INTERACTIONS *
2984              **************************/
2985
2986             if (gmx_mm_any_lt(rsq21,rcutoff2))
2987             {
2988
2989             r21              = _mm_mul_ps(rsq21,rinv21);
2990             r21              = _mm_andnot_ps(dummy_mask,r21);
2991
2992             /* EWALD ELECTROSTATICS */
2993
2994             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2995             ewrt             = _mm_mul_ps(r21,ewtabscale);
2996             ewitab           = _mm_cvttps_epi32(ewrt);
2997             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2998             ewitab           = _mm_slli_epi32(ewitab,2);
2999             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3000             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3001             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3002             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3003             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3004             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3005             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3006             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
3007             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
3008
3009             d                = _mm_sub_ps(r21,rswitch);
3010             d                = _mm_max_ps(d,_mm_setzero_ps());
3011             d2               = _mm_mul_ps(d,d);
3012             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3013
3014             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3015
3016             /* Evaluate switch function */
3017             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3018             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv21,_mm_mul_ps(velec,dsw)) );
3019             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
3020
3021             fscal            = felec;
3022
3023             fscal            = _mm_and_ps(fscal,cutoff_mask);
3024
3025             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3026
3027             /* Calculate temporary vectorial force */
3028             tx               = _mm_mul_ps(fscal,dx21);
3029             ty               = _mm_mul_ps(fscal,dy21);
3030             tz               = _mm_mul_ps(fscal,dz21);
3031
3032             /* Update vectorial force */
3033             fix2             = _mm_add_ps(fix2,tx);
3034             fiy2             = _mm_add_ps(fiy2,ty);
3035             fiz2             = _mm_add_ps(fiz2,tz);
3036
3037             fjx1             = _mm_add_ps(fjx1,tx);
3038             fjy1             = _mm_add_ps(fjy1,ty);
3039             fjz1             = _mm_add_ps(fjz1,tz);
3040             
3041             }
3042
3043             /**************************
3044              * CALCULATE INTERACTIONS *
3045              **************************/
3046
3047             if (gmx_mm_any_lt(rsq22,rcutoff2))
3048             {
3049
3050             r22              = _mm_mul_ps(rsq22,rinv22);
3051             r22              = _mm_andnot_ps(dummy_mask,r22);
3052
3053             /* EWALD ELECTROSTATICS */
3054
3055             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
3056             ewrt             = _mm_mul_ps(r22,ewtabscale);
3057             ewitab           = _mm_cvttps_epi32(ewrt);
3058             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
3059             ewitab           = _mm_slli_epi32(ewitab,2);
3060             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
3061             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
3062             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
3063             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
3064             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
3065             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
3066             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
3067             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
3068             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
3069
3070             d                = _mm_sub_ps(r22,rswitch);
3071             d                = _mm_max_ps(d,_mm_setzero_ps());
3072             d2               = _mm_mul_ps(d,d);
3073             sw               = _mm_add_ps(one,_mm_mul_ps(d2,_mm_mul_ps(d,_mm_add_ps(swV3,_mm_mul_ps(d,_mm_add_ps(swV4,_mm_mul_ps(d,swV5)))))));
3074
3075             dsw              = _mm_mul_ps(d2,_mm_add_ps(swF2,_mm_mul_ps(d,_mm_add_ps(swF3,_mm_mul_ps(d,swF4)))));
3076
3077             /* Evaluate switch function */
3078             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3079             felec            = _mm_sub_ps( _mm_mul_ps(felec,sw) , _mm_mul_ps(rinv22,_mm_mul_ps(velec,dsw)) );
3080             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
3081
3082             fscal            = felec;
3083
3084             fscal            = _mm_and_ps(fscal,cutoff_mask);
3085
3086             fscal            = _mm_andnot_ps(dummy_mask,fscal);
3087
3088             /* Calculate temporary vectorial force */
3089             tx               = _mm_mul_ps(fscal,dx22);
3090             ty               = _mm_mul_ps(fscal,dy22);
3091             tz               = _mm_mul_ps(fscal,dz22);
3092
3093             /* Update vectorial force */
3094             fix2             = _mm_add_ps(fix2,tx);
3095             fiy2             = _mm_add_ps(fiy2,ty);
3096             fiz2             = _mm_add_ps(fiz2,tz);
3097
3098             fjx2             = _mm_add_ps(fjx2,tx);
3099             fjy2             = _mm_add_ps(fjy2,ty);
3100             fjz2             = _mm_add_ps(fjz2,tz);
3101             
3102             }
3103
3104             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3105             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3106             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3107             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3108
3109             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
3110                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3111
3112             /* Inner loop uses 582 flops */
3113         }
3114
3115         /* End of innermost loop */
3116
3117         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3118                                               f+i_coord_offset,fshift+i_shift_offset);
3119
3120         /* Increment number of inner iterations */
3121         inneriter                  += j_index_end - j_index_start;
3122
3123         /* Outer loop uses 18 flops */
3124     }
3125
3126     /* Increment number of outer iterations */
3127     outeriter        += nri;
3128
3129     /* Update outer/inner flops */
3130
3131     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*582);
3132 }