7f82e4e639d383401dadf1bf4019b914adeb21bc
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     int              nvdwtype;
93     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
94     int              *vdwtype;
95     real             *vdwparam;
96     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
97     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
98     __m128i          ewitab;
99     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
100     real             *ewtab;
101     __m128           dummy_mask,cutoff_mask;
102     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
103     __m128           one     = _mm_set1_ps(1.0);
104     __m128           two     = _mm_set1_ps(2.0);
105     x                = xx[0];
106     f                = ff[0];
107
108     nri              = nlist->nri;
109     iinr             = nlist->iinr;
110     jindex           = nlist->jindex;
111     jjnr             = nlist->jjnr;
112     shiftidx         = nlist->shift;
113     gid              = nlist->gid;
114     shiftvec         = fr->shift_vec[0];
115     fshift           = fr->fshift[0];
116     facel            = _mm_set1_ps(fr->epsfac);
117     charge           = mdatoms->chargeA;
118     nvdwtype         = fr->ntype;
119     vdwparam         = fr->nbfp;
120     vdwtype          = mdatoms->typeA;
121
122     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
123     ewtab            = fr->ic->tabq_coul_FDV0;
124     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
125     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
126
127     /* Setup water-specific parameters */
128     inr              = nlist->iinr[0];
129     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
130     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
131     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
132     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
133
134     jq0              = _mm_set1_ps(charge[inr+0]);
135     jq1              = _mm_set1_ps(charge[inr+1]);
136     jq2              = _mm_set1_ps(charge[inr+2]);
137     vdwjidx0A        = 2*vdwtype[inr+0];
138     qq00             = _mm_mul_ps(iq0,jq0);
139     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
140     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
141     qq01             = _mm_mul_ps(iq0,jq1);
142     qq02             = _mm_mul_ps(iq0,jq2);
143     qq10             = _mm_mul_ps(iq1,jq0);
144     qq11             = _mm_mul_ps(iq1,jq1);
145     qq12             = _mm_mul_ps(iq1,jq2);
146     qq20             = _mm_mul_ps(iq2,jq0);
147     qq21             = _mm_mul_ps(iq2,jq1);
148     qq22             = _mm_mul_ps(iq2,jq2);
149
150     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
151     rcutoff_scalar   = fr->rcoulomb;
152     rcutoff          = _mm_set1_ps(rcutoff_scalar);
153     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
154
155     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
156     rvdw             = _mm_set1_ps(fr->rvdw);
157
158     /* Avoid stupid compiler warnings */
159     jnrA = jnrB = jnrC = jnrD = 0;
160     j_coord_offsetA = 0;
161     j_coord_offsetB = 0;
162     j_coord_offsetC = 0;
163     j_coord_offsetD = 0;
164
165     outeriter        = 0;
166     inneriter        = 0;
167
168     for(iidx=0;iidx<4*DIM;iidx++)
169     {
170         scratch[iidx] = 0.0;
171     }  
172
173     /* Start outer loop over neighborlists */
174     for(iidx=0; iidx<nri; iidx++)
175     {
176         /* Load shift vector for this list */
177         i_shift_offset   = DIM*shiftidx[iidx];
178
179         /* Load limits for loop over neighbors */
180         j_index_start    = jindex[iidx];
181         j_index_end      = jindex[iidx+1];
182
183         /* Get outer coordinate index */
184         inr              = iinr[iidx];
185         i_coord_offset   = DIM*inr;
186
187         /* Load i particle coords and add shift vector */
188         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
189                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
190         
191         fix0             = _mm_setzero_ps();
192         fiy0             = _mm_setzero_ps();
193         fiz0             = _mm_setzero_ps();
194         fix1             = _mm_setzero_ps();
195         fiy1             = _mm_setzero_ps();
196         fiz1             = _mm_setzero_ps();
197         fix2             = _mm_setzero_ps();
198         fiy2             = _mm_setzero_ps();
199         fiz2             = _mm_setzero_ps();
200
201         /* Reset potential sums */
202         velecsum         = _mm_setzero_ps();
203         vvdwsum          = _mm_setzero_ps();
204
205         /* Start inner kernel loop */
206         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
207         {
208
209             /* Get j neighbor index, and coordinate index */
210             jnrA             = jjnr[jidx];
211             jnrB             = jjnr[jidx+1];
212             jnrC             = jjnr[jidx+2];
213             jnrD             = jjnr[jidx+3];
214             j_coord_offsetA  = DIM*jnrA;
215             j_coord_offsetB  = DIM*jnrB;
216             j_coord_offsetC  = DIM*jnrC;
217             j_coord_offsetD  = DIM*jnrD;
218
219             /* load j atom coordinates */
220             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
221                                               x+j_coord_offsetC,x+j_coord_offsetD,
222                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
223
224             /* Calculate displacement vector */
225             dx00             = _mm_sub_ps(ix0,jx0);
226             dy00             = _mm_sub_ps(iy0,jy0);
227             dz00             = _mm_sub_ps(iz0,jz0);
228             dx01             = _mm_sub_ps(ix0,jx1);
229             dy01             = _mm_sub_ps(iy0,jy1);
230             dz01             = _mm_sub_ps(iz0,jz1);
231             dx02             = _mm_sub_ps(ix0,jx2);
232             dy02             = _mm_sub_ps(iy0,jy2);
233             dz02             = _mm_sub_ps(iz0,jz2);
234             dx10             = _mm_sub_ps(ix1,jx0);
235             dy10             = _mm_sub_ps(iy1,jy0);
236             dz10             = _mm_sub_ps(iz1,jz0);
237             dx11             = _mm_sub_ps(ix1,jx1);
238             dy11             = _mm_sub_ps(iy1,jy1);
239             dz11             = _mm_sub_ps(iz1,jz1);
240             dx12             = _mm_sub_ps(ix1,jx2);
241             dy12             = _mm_sub_ps(iy1,jy2);
242             dz12             = _mm_sub_ps(iz1,jz2);
243             dx20             = _mm_sub_ps(ix2,jx0);
244             dy20             = _mm_sub_ps(iy2,jy0);
245             dz20             = _mm_sub_ps(iz2,jz0);
246             dx21             = _mm_sub_ps(ix2,jx1);
247             dy21             = _mm_sub_ps(iy2,jy1);
248             dz21             = _mm_sub_ps(iz2,jz1);
249             dx22             = _mm_sub_ps(ix2,jx2);
250             dy22             = _mm_sub_ps(iy2,jy2);
251             dz22             = _mm_sub_ps(iz2,jz2);
252
253             /* Calculate squared distance and things based on it */
254             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
255             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
256             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
257             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
258             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
259             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
260             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
261             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
262             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
263
264             rinv00           = gmx_mm_invsqrt_ps(rsq00);
265             rinv01           = gmx_mm_invsqrt_ps(rsq01);
266             rinv02           = gmx_mm_invsqrt_ps(rsq02);
267             rinv10           = gmx_mm_invsqrt_ps(rsq10);
268             rinv11           = gmx_mm_invsqrt_ps(rsq11);
269             rinv12           = gmx_mm_invsqrt_ps(rsq12);
270             rinv20           = gmx_mm_invsqrt_ps(rsq20);
271             rinv21           = gmx_mm_invsqrt_ps(rsq21);
272             rinv22           = gmx_mm_invsqrt_ps(rsq22);
273
274             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
275             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
276             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
277             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
278             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
279             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
280             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
281             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
282             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
283
284             fjx0             = _mm_setzero_ps();
285             fjy0             = _mm_setzero_ps();
286             fjz0             = _mm_setzero_ps();
287             fjx1             = _mm_setzero_ps();
288             fjy1             = _mm_setzero_ps();
289             fjz1             = _mm_setzero_ps();
290             fjx2             = _mm_setzero_ps();
291             fjy2             = _mm_setzero_ps();
292             fjz2             = _mm_setzero_ps();
293
294             /**************************
295              * CALCULATE INTERACTIONS *
296              **************************/
297
298             if (gmx_mm_any_lt(rsq00,rcutoff2))
299             {
300
301             r00              = _mm_mul_ps(rsq00,rinv00);
302
303             /* EWALD ELECTROSTATICS */
304
305             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
306             ewrt             = _mm_mul_ps(r00,ewtabscale);
307             ewitab           = _mm_cvttps_epi32(ewrt);
308             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
309             ewitab           = _mm_slli_epi32(ewitab,2);
310             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
311             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
312             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
313             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
314             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
315             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
316             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
317             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
318             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
319
320             /* LENNARD-JONES DISPERSION/REPULSION */
321
322             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
323             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
324             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
325             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
326                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
327             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
328
329             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
330
331             /* Update potential sum for this i atom from the interaction with this j atom. */
332             velec            = _mm_and_ps(velec,cutoff_mask);
333             velecsum         = _mm_add_ps(velecsum,velec);
334             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
335             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
336
337             fscal            = _mm_add_ps(felec,fvdw);
338
339             fscal            = _mm_and_ps(fscal,cutoff_mask);
340
341             /* Calculate temporary vectorial force */
342             tx               = _mm_mul_ps(fscal,dx00);
343             ty               = _mm_mul_ps(fscal,dy00);
344             tz               = _mm_mul_ps(fscal,dz00);
345
346             /* Update vectorial force */
347             fix0             = _mm_add_ps(fix0,tx);
348             fiy0             = _mm_add_ps(fiy0,ty);
349             fiz0             = _mm_add_ps(fiz0,tz);
350
351             fjx0             = _mm_add_ps(fjx0,tx);
352             fjy0             = _mm_add_ps(fjy0,ty);
353             fjz0             = _mm_add_ps(fjz0,tz);
354             
355             }
356
357             /**************************
358              * CALCULATE INTERACTIONS *
359              **************************/
360
361             if (gmx_mm_any_lt(rsq01,rcutoff2))
362             {
363
364             r01              = _mm_mul_ps(rsq01,rinv01);
365
366             /* EWALD ELECTROSTATICS */
367
368             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
369             ewrt             = _mm_mul_ps(r01,ewtabscale);
370             ewitab           = _mm_cvttps_epi32(ewrt);
371             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
372             ewitab           = _mm_slli_epi32(ewitab,2);
373             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
374             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
375             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
376             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
377             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
378             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
379             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
380             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
381             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
382
383             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
384
385             /* Update potential sum for this i atom from the interaction with this j atom. */
386             velec            = _mm_and_ps(velec,cutoff_mask);
387             velecsum         = _mm_add_ps(velecsum,velec);
388
389             fscal            = felec;
390
391             fscal            = _mm_and_ps(fscal,cutoff_mask);
392
393             /* Calculate temporary vectorial force */
394             tx               = _mm_mul_ps(fscal,dx01);
395             ty               = _mm_mul_ps(fscal,dy01);
396             tz               = _mm_mul_ps(fscal,dz01);
397
398             /* Update vectorial force */
399             fix0             = _mm_add_ps(fix0,tx);
400             fiy0             = _mm_add_ps(fiy0,ty);
401             fiz0             = _mm_add_ps(fiz0,tz);
402
403             fjx1             = _mm_add_ps(fjx1,tx);
404             fjy1             = _mm_add_ps(fjy1,ty);
405             fjz1             = _mm_add_ps(fjz1,tz);
406             
407             }
408
409             /**************************
410              * CALCULATE INTERACTIONS *
411              **************************/
412
413             if (gmx_mm_any_lt(rsq02,rcutoff2))
414             {
415
416             r02              = _mm_mul_ps(rsq02,rinv02);
417
418             /* EWALD ELECTROSTATICS */
419
420             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
421             ewrt             = _mm_mul_ps(r02,ewtabscale);
422             ewitab           = _mm_cvttps_epi32(ewrt);
423             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
424             ewitab           = _mm_slli_epi32(ewitab,2);
425             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
426             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
427             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
428             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
429             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
430             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
431             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
432             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
433             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
434
435             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
436
437             /* Update potential sum for this i atom from the interaction with this j atom. */
438             velec            = _mm_and_ps(velec,cutoff_mask);
439             velecsum         = _mm_add_ps(velecsum,velec);
440
441             fscal            = felec;
442
443             fscal            = _mm_and_ps(fscal,cutoff_mask);
444
445             /* Calculate temporary vectorial force */
446             tx               = _mm_mul_ps(fscal,dx02);
447             ty               = _mm_mul_ps(fscal,dy02);
448             tz               = _mm_mul_ps(fscal,dz02);
449
450             /* Update vectorial force */
451             fix0             = _mm_add_ps(fix0,tx);
452             fiy0             = _mm_add_ps(fiy0,ty);
453             fiz0             = _mm_add_ps(fiz0,tz);
454
455             fjx2             = _mm_add_ps(fjx2,tx);
456             fjy2             = _mm_add_ps(fjy2,ty);
457             fjz2             = _mm_add_ps(fjz2,tz);
458             
459             }
460
461             /**************************
462              * CALCULATE INTERACTIONS *
463              **************************/
464
465             if (gmx_mm_any_lt(rsq10,rcutoff2))
466             {
467
468             r10              = _mm_mul_ps(rsq10,rinv10);
469
470             /* EWALD ELECTROSTATICS */
471
472             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
473             ewrt             = _mm_mul_ps(r10,ewtabscale);
474             ewitab           = _mm_cvttps_epi32(ewrt);
475             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
476             ewitab           = _mm_slli_epi32(ewitab,2);
477             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
478             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
479             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
480             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
481             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
482             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
483             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
484             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
485             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
486
487             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
488
489             /* Update potential sum for this i atom from the interaction with this j atom. */
490             velec            = _mm_and_ps(velec,cutoff_mask);
491             velecsum         = _mm_add_ps(velecsum,velec);
492
493             fscal            = felec;
494
495             fscal            = _mm_and_ps(fscal,cutoff_mask);
496
497             /* Calculate temporary vectorial force */
498             tx               = _mm_mul_ps(fscal,dx10);
499             ty               = _mm_mul_ps(fscal,dy10);
500             tz               = _mm_mul_ps(fscal,dz10);
501
502             /* Update vectorial force */
503             fix1             = _mm_add_ps(fix1,tx);
504             fiy1             = _mm_add_ps(fiy1,ty);
505             fiz1             = _mm_add_ps(fiz1,tz);
506
507             fjx0             = _mm_add_ps(fjx0,tx);
508             fjy0             = _mm_add_ps(fjy0,ty);
509             fjz0             = _mm_add_ps(fjz0,tz);
510             
511             }
512
513             /**************************
514              * CALCULATE INTERACTIONS *
515              **************************/
516
517             if (gmx_mm_any_lt(rsq11,rcutoff2))
518             {
519
520             r11              = _mm_mul_ps(rsq11,rinv11);
521
522             /* EWALD ELECTROSTATICS */
523
524             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
525             ewrt             = _mm_mul_ps(r11,ewtabscale);
526             ewitab           = _mm_cvttps_epi32(ewrt);
527             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
528             ewitab           = _mm_slli_epi32(ewitab,2);
529             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
530             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
531             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
532             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
533             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
534             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
535             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
536             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
537             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
538
539             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
540
541             /* Update potential sum for this i atom from the interaction with this j atom. */
542             velec            = _mm_and_ps(velec,cutoff_mask);
543             velecsum         = _mm_add_ps(velecsum,velec);
544
545             fscal            = felec;
546
547             fscal            = _mm_and_ps(fscal,cutoff_mask);
548
549             /* Calculate temporary vectorial force */
550             tx               = _mm_mul_ps(fscal,dx11);
551             ty               = _mm_mul_ps(fscal,dy11);
552             tz               = _mm_mul_ps(fscal,dz11);
553
554             /* Update vectorial force */
555             fix1             = _mm_add_ps(fix1,tx);
556             fiy1             = _mm_add_ps(fiy1,ty);
557             fiz1             = _mm_add_ps(fiz1,tz);
558
559             fjx1             = _mm_add_ps(fjx1,tx);
560             fjy1             = _mm_add_ps(fjy1,ty);
561             fjz1             = _mm_add_ps(fjz1,tz);
562             
563             }
564
565             /**************************
566              * CALCULATE INTERACTIONS *
567              **************************/
568
569             if (gmx_mm_any_lt(rsq12,rcutoff2))
570             {
571
572             r12              = _mm_mul_ps(rsq12,rinv12);
573
574             /* EWALD ELECTROSTATICS */
575
576             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
577             ewrt             = _mm_mul_ps(r12,ewtabscale);
578             ewitab           = _mm_cvttps_epi32(ewrt);
579             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
580             ewitab           = _mm_slli_epi32(ewitab,2);
581             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
582             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
583             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
584             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
585             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
586             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
587             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
588             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
589             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
590
591             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
592
593             /* Update potential sum for this i atom from the interaction with this j atom. */
594             velec            = _mm_and_ps(velec,cutoff_mask);
595             velecsum         = _mm_add_ps(velecsum,velec);
596
597             fscal            = felec;
598
599             fscal            = _mm_and_ps(fscal,cutoff_mask);
600
601             /* Calculate temporary vectorial force */
602             tx               = _mm_mul_ps(fscal,dx12);
603             ty               = _mm_mul_ps(fscal,dy12);
604             tz               = _mm_mul_ps(fscal,dz12);
605
606             /* Update vectorial force */
607             fix1             = _mm_add_ps(fix1,tx);
608             fiy1             = _mm_add_ps(fiy1,ty);
609             fiz1             = _mm_add_ps(fiz1,tz);
610
611             fjx2             = _mm_add_ps(fjx2,tx);
612             fjy2             = _mm_add_ps(fjy2,ty);
613             fjz2             = _mm_add_ps(fjz2,tz);
614             
615             }
616
617             /**************************
618              * CALCULATE INTERACTIONS *
619              **************************/
620
621             if (gmx_mm_any_lt(rsq20,rcutoff2))
622             {
623
624             r20              = _mm_mul_ps(rsq20,rinv20);
625
626             /* EWALD ELECTROSTATICS */
627
628             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
629             ewrt             = _mm_mul_ps(r20,ewtabscale);
630             ewitab           = _mm_cvttps_epi32(ewrt);
631             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
632             ewitab           = _mm_slli_epi32(ewitab,2);
633             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
634             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
635             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
636             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
637             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
638             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
639             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
640             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
641             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
642
643             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
644
645             /* Update potential sum for this i atom from the interaction with this j atom. */
646             velec            = _mm_and_ps(velec,cutoff_mask);
647             velecsum         = _mm_add_ps(velecsum,velec);
648
649             fscal            = felec;
650
651             fscal            = _mm_and_ps(fscal,cutoff_mask);
652
653             /* Calculate temporary vectorial force */
654             tx               = _mm_mul_ps(fscal,dx20);
655             ty               = _mm_mul_ps(fscal,dy20);
656             tz               = _mm_mul_ps(fscal,dz20);
657
658             /* Update vectorial force */
659             fix2             = _mm_add_ps(fix2,tx);
660             fiy2             = _mm_add_ps(fiy2,ty);
661             fiz2             = _mm_add_ps(fiz2,tz);
662
663             fjx0             = _mm_add_ps(fjx0,tx);
664             fjy0             = _mm_add_ps(fjy0,ty);
665             fjz0             = _mm_add_ps(fjz0,tz);
666             
667             }
668
669             /**************************
670              * CALCULATE INTERACTIONS *
671              **************************/
672
673             if (gmx_mm_any_lt(rsq21,rcutoff2))
674             {
675
676             r21              = _mm_mul_ps(rsq21,rinv21);
677
678             /* EWALD ELECTROSTATICS */
679
680             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
681             ewrt             = _mm_mul_ps(r21,ewtabscale);
682             ewitab           = _mm_cvttps_epi32(ewrt);
683             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
684             ewitab           = _mm_slli_epi32(ewitab,2);
685             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
686             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
687             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
688             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
689             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
690             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
691             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
692             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
693             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
694
695             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
696
697             /* Update potential sum for this i atom from the interaction with this j atom. */
698             velec            = _mm_and_ps(velec,cutoff_mask);
699             velecsum         = _mm_add_ps(velecsum,velec);
700
701             fscal            = felec;
702
703             fscal            = _mm_and_ps(fscal,cutoff_mask);
704
705             /* Calculate temporary vectorial force */
706             tx               = _mm_mul_ps(fscal,dx21);
707             ty               = _mm_mul_ps(fscal,dy21);
708             tz               = _mm_mul_ps(fscal,dz21);
709
710             /* Update vectorial force */
711             fix2             = _mm_add_ps(fix2,tx);
712             fiy2             = _mm_add_ps(fiy2,ty);
713             fiz2             = _mm_add_ps(fiz2,tz);
714
715             fjx1             = _mm_add_ps(fjx1,tx);
716             fjy1             = _mm_add_ps(fjy1,ty);
717             fjz1             = _mm_add_ps(fjz1,tz);
718             
719             }
720
721             /**************************
722              * CALCULATE INTERACTIONS *
723              **************************/
724
725             if (gmx_mm_any_lt(rsq22,rcutoff2))
726             {
727
728             r22              = _mm_mul_ps(rsq22,rinv22);
729
730             /* EWALD ELECTROSTATICS */
731
732             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
733             ewrt             = _mm_mul_ps(r22,ewtabscale);
734             ewitab           = _mm_cvttps_epi32(ewrt);
735             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
736             ewitab           = _mm_slli_epi32(ewitab,2);
737             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
738             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
739             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
740             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
741             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
742             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
743             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
744             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
745             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
746
747             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
748
749             /* Update potential sum for this i atom from the interaction with this j atom. */
750             velec            = _mm_and_ps(velec,cutoff_mask);
751             velecsum         = _mm_add_ps(velecsum,velec);
752
753             fscal            = felec;
754
755             fscal            = _mm_and_ps(fscal,cutoff_mask);
756
757             /* Calculate temporary vectorial force */
758             tx               = _mm_mul_ps(fscal,dx22);
759             ty               = _mm_mul_ps(fscal,dy22);
760             tz               = _mm_mul_ps(fscal,dz22);
761
762             /* Update vectorial force */
763             fix2             = _mm_add_ps(fix2,tx);
764             fiy2             = _mm_add_ps(fiy2,ty);
765             fiz2             = _mm_add_ps(fiz2,tz);
766
767             fjx2             = _mm_add_ps(fjx2,tx);
768             fjy2             = _mm_add_ps(fjy2,ty);
769             fjz2             = _mm_add_ps(fjz2,tz);
770             
771             }
772
773             fjptrA             = f+j_coord_offsetA;
774             fjptrB             = f+j_coord_offsetB;
775             fjptrC             = f+j_coord_offsetC;
776             fjptrD             = f+j_coord_offsetD;
777
778             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
779                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
780
781             /* Inner loop uses 432 flops */
782         }
783
784         if(jidx<j_index_end)
785         {
786
787             /* Get j neighbor index, and coordinate index */
788             jnrlistA         = jjnr[jidx];
789             jnrlistB         = jjnr[jidx+1];
790             jnrlistC         = jjnr[jidx+2];
791             jnrlistD         = jjnr[jidx+3];
792             /* Sign of each element will be negative for non-real atoms.
793              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
794              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
795              */
796             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
797             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
798             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
799             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
800             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
801             j_coord_offsetA  = DIM*jnrA;
802             j_coord_offsetB  = DIM*jnrB;
803             j_coord_offsetC  = DIM*jnrC;
804             j_coord_offsetD  = DIM*jnrD;
805
806             /* load j atom coordinates */
807             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
808                                               x+j_coord_offsetC,x+j_coord_offsetD,
809                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
810
811             /* Calculate displacement vector */
812             dx00             = _mm_sub_ps(ix0,jx0);
813             dy00             = _mm_sub_ps(iy0,jy0);
814             dz00             = _mm_sub_ps(iz0,jz0);
815             dx01             = _mm_sub_ps(ix0,jx1);
816             dy01             = _mm_sub_ps(iy0,jy1);
817             dz01             = _mm_sub_ps(iz0,jz1);
818             dx02             = _mm_sub_ps(ix0,jx2);
819             dy02             = _mm_sub_ps(iy0,jy2);
820             dz02             = _mm_sub_ps(iz0,jz2);
821             dx10             = _mm_sub_ps(ix1,jx0);
822             dy10             = _mm_sub_ps(iy1,jy0);
823             dz10             = _mm_sub_ps(iz1,jz0);
824             dx11             = _mm_sub_ps(ix1,jx1);
825             dy11             = _mm_sub_ps(iy1,jy1);
826             dz11             = _mm_sub_ps(iz1,jz1);
827             dx12             = _mm_sub_ps(ix1,jx2);
828             dy12             = _mm_sub_ps(iy1,jy2);
829             dz12             = _mm_sub_ps(iz1,jz2);
830             dx20             = _mm_sub_ps(ix2,jx0);
831             dy20             = _mm_sub_ps(iy2,jy0);
832             dz20             = _mm_sub_ps(iz2,jz0);
833             dx21             = _mm_sub_ps(ix2,jx1);
834             dy21             = _mm_sub_ps(iy2,jy1);
835             dz21             = _mm_sub_ps(iz2,jz1);
836             dx22             = _mm_sub_ps(ix2,jx2);
837             dy22             = _mm_sub_ps(iy2,jy2);
838             dz22             = _mm_sub_ps(iz2,jz2);
839
840             /* Calculate squared distance and things based on it */
841             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
842             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
843             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
844             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
845             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
846             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
847             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
848             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
849             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
850
851             rinv00           = gmx_mm_invsqrt_ps(rsq00);
852             rinv01           = gmx_mm_invsqrt_ps(rsq01);
853             rinv02           = gmx_mm_invsqrt_ps(rsq02);
854             rinv10           = gmx_mm_invsqrt_ps(rsq10);
855             rinv11           = gmx_mm_invsqrt_ps(rsq11);
856             rinv12           = gmx_mm_invsqrt_ps(rsq12);
857             rinv20           = gmx_mm_invsqrt_ps(rsq20);
858             rinv21           = gmx_mm_invsqrt_ps(rsq21);
859             rinv22           = gmx_mm_invsqrt_ps(rsq22);
860
861             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
862             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
863             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
864             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
865             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
866             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
867             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
868             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
869             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
870
871             fjx0             = _mm_setzero_ps();
872             fjy0             = _mm_setzero_ps();
873             fjz0             = _mm_setzero_ps();
874             fjx1             = _mm_setzero_ps();
875             fjy1             = _mm_setzero_ps();
876             fjz1             = _mm_setzero_ps();
877             fjx2             = _mm_setzero_ps();
878             fjy2             = _mm_setzero_ps();
879             fjz2             = _mm_setzero_ps();
880
881             /**************************
882              * CALCULATE INTERACTIONS *
883              **************************/
884
885             if (gmx_mm_any_lt(rsq00,rcutoff2))
886             {
887
888             r00              = _mm_mul_ps(rsq00,rinv00);
889             r00              = _mm_andnot_ps(dummy_mask,r00);
890
891             /* EWALD ELECTROSTATICS */
892
893             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
894             ewrt             = _mm_mul_ps(r00,ewtabscale);
895             ewitab           = _mm_cvttps_epi32(ewrt);
896             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
897             ewitab           = _mm_slli_epi32(ewitab,2);
898             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
899             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
900             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
901             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
902             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
903             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
904             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
905             velec            = _mm_mul_ps(qq00,_mm_sub_ps(_mm_sub_ps(rinv00,sh_ewald),velec));
906             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
907
908             /* LENNARD-JONES DISPERSION/REPULSION */
909
910             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
911             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
912             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
913             vvdw             = _mm_sub_ps(_mm_mul_ps( _mm_sub_ps(vvdw12 , _mm_mul_ps(c12_00,_mm_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
914                                           _mm_mul_ps( _mm_sub_ps(vvdw6,_mm_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
915             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
916
917             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
918
919             /* Update potential sum for this i atom from the interaction with this j atom. */
920             velec            = _mm_and_ps(velec,cutoff_mask);
921             velec            = _mm_andnot_ps(dummy_mask,velec);
922             velecsum         = _mm_add_ps(velecsum,velec);
923             vvdw             = _mm_and_ps(vvdw,cutoff_mask);
924             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
925             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
926
927             fscal            = _mm_add_ps(felec,fvdw);
928
929             fscal            = _mm_and_ps(fscal,cutoff_mask);
930
931             fscal            = _mm_andnot_ps(dummy_mask,fscal);
932
933             /* Calculate temporary vectorial force */
934             tx               = _mm_mul_ps(fscal,dx00);
935             ty               = _mm_mul_ps(fscal,dy00);
936             tz               = _mm_mul_ps(fscal,dz00);
937
938             /* Update vectorial force */
939             fix0             = _mm_add_ps(fix0,tx);
940             fiy0             = _mm_add_ps(fiy0,ty);
941             fiz0             = _mm_add_ps(fiz0,tz);
942
943             fjx0             = _mm_add_ps(fjx0,tx);
944             fjy0             = _mm_add_ps(fjy0,ty);
945             fjz0             = _mm_add_ps(fjz0,tz);
946             
947             }
948
949             /**************************
950              * CALCULATE INTERACTIONS *
951              **************************/
952
953             if (gmx_mm_any_lt(rsq01,rcutoff2))
954             {
955
956             r01              = _mm_mul_ps(rsq01,rinv01);
957             r01              = _mm_andnot_ps(dummy_mask,r01);
958
959             /* EWALD ELECTROSTATICS */
960
961             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
962             ewrt             = _mm_mul_ps(r01,ewtabscale);
963             ewitab           = _mm_cvttps_epi32(ewrt);
964             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
965             ewitab           = _mm_slli_epi32(ewitab,2);
966             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
967             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
968             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
969             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
970             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
971             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
972             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
973             velec            = _mm_mul_ps(qq01,_mm_sub_ps(_mm_sub_ps(rinv01,sh_ewald),velec));
974             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
975
976             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
977
978             /* Update potential sum for this i atom from the interaction with this j atom. */
979             velec            = _mm_and_ps(velec,cutoff_mask);
980             velec            = _mm_andnot_ps(dummy_mask,velec);
981             velecsum         = _mm_add_ps(velecsum,velec);
982
983             fscal            = felec;
984
985             fscal            = _mm_and_ps(fscal,cutoff_mask);
986
987             fscal            = _mm_andnot_ps(dummy_mask,fscal);
988
989             /* Calculate temporary vectorial force */
990             tx               = _mm_mul_ps(fscal,dx01);
991             ty               = _mm_mul_ps(fscal,dy01);
992             tz               = _mm_mul_ps(fscal,dz01);
993
994             /* Update vectorial force */
995             fix0             = _mm_add_ps(fix0,tx);
996             fiy0             = _mm_add_ps(fiy0,ty);
997             fiz0             = _mm_add_ps(fiz0,tz);
998
999             fjx1             = _mm_add_ps(fjx1,tx);
1000             fjy1             = _mm_add_ps(fjy1,ty);
1001             fjz1             = _mm_add_ps(fjz1,tz);
1002             
1003             }
1004
1005             /**************************
1006              * CALCULATE INTERACTIONS *
1007              **************************/
1008
1009             if (gmx_mm_any_lt(rsq02,rcutoff2))
1010             {
1011
1012             r02              = _mm_mul_ps(rsq02,rinv02);
1013             r02              = _mm_andnot_ps(dummy_mask,r02);
1014
1015             /* EWALD ELECTROSTATICS */
1016
1017             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1018             ewrt             = _mm_mul_ps(r02,ewtabscale);
1019             ewitab           = _mm_cvttps_epi32(ewrt);
1020             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1021             ewitab           = _mm_slli_epi32(ewitab,2);
1022             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1023             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1024             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1025             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1026             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1027             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1028             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1029             velec            = _mm_mul_ps(qq02,_mm_sub_ps(_mm_sub_ps(rinv02,sh_ewald),velec));
1030             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1031
1032             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1033
1034             /* Update potential sum for this i atom from the interaction with this j atom. */
1035             velec            = _mm_and_ps(velec,cutoff_mask);
1036             velec            = _mm_andnot_ps(dummy_mask,velec);
1037             velecsum         = _mm_add_ps(velecsum,velec);
1038
1039             fscal            = felec;
1040
1041             fscal            = _mm_and_ps(fscal,cutoff_mask);
1042
1043             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1044
1045             /* Calculate temporary vectorial force */
1046             tx               = _mm_mul_ps(fscal,dx02);
1047             ty               = _mm_mul_ps(fscal,dy02);
1048             tz               = _mm_mul_ps(fscal,dz02);
1049
1050             /* Update vectorial force */
1051             fix0             = _mm_add_ps(fix0,tx);
1052             fiy0             = _mm_add_ps(fiy0,ty);
1053             fiz0             = _mm_add_ps(fiz0,tz);
1054
1055             fjx2             = _mm_add_ps(fjx2,tx);
1056             fjy2             = _mm_add_ps(fjy2,ty);
1057             fjz2             = _mm_add_ps(fjz2,tz);
1058             
1059             }
1060
1061             /**************************
1062              * CALCULATE INTERACTIONS *
1063              **************************/
1064
1065             if (gmx_mm_any_lt(rsq10,rcutoff2))
1066             {
1067
1068             r10              = _mm_mul_ps(rsq10,rinv10);
1069             r10              = _mm_andnot_ps(dummy_mask,r10);
1070
1071             /* EWALD ELECTROSTATICS */
1072
1073             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1074             ewrt             = _mm_mul_ps(r10,ewtabscale);
1075             ewitab           = _mm_cvttps_epi32(ewrt);
1076             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1077             ewitab           = _mm_slli_epi32(ewitab,2);
1078             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1079             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1080             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1081             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1082             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1083             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1084             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1085             velec            = _mm_mul_ps(qq10,_mm_sub_ps(_mm_sub_ps(rinv10,sh_ewald),velec));
1086             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1087
1088             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1089
1090             /* Update potential sum for this i atom from the interaction with this j atom. */
1091             velec            = _mm_and_ps(velec,cutoff_mask);
1092             velec            = _mm_andnot_ps(dummy_mask,velec);
1093             velecsum         = _mm_add_ps(velecsum,velec);
1094
1095             fscal            = felec;
1096
1097             fscal            = _mm_and_ps(fscal,cutoff_mask);
1098
1099             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1100
1101             /* Calculate temporary vectorial force */
1102             tx               = _mm_mul_ps(fscal,dx10);
1103             ty               = _mm_mul_ps(fscal,dy10);
1104             tz               = _mm_mul_ps(fscal,dz10);
1105
1106             /* Update vectorial force */
1107             fix1             = _mm_add_ps(fix1,tx);
1108             fiy1             = _mm_add_ps(fiy1,ty);
1109             fiz1             = _mm_add_ps(fiz1,tz);
1110
1111             fjx0             = _mm_add_ps(fjx0,tx);
1112             fjy0             = _mm_add_ps(fjy0,ty);
1113             fjz0             = _mm_add_ps(fjz0,tz);
1114             
1115             }
1116
1117             /**************************
1118              * CALCULATE INTERACTIONS *
1119              **************************/
1120
1121             if (gmx_mm_any_lt(rsq11,rcutoff2))
1122             {
1123
1124             r11              = _mm_mul_ps(rsq11,rinv11);
1125             r11              = _mm_andnot_ps(dummy_mask,r11);
1126
1127             /* EWALD ELECTROSTATICS */
1128
1129             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1130             ewrt             = _mm_mul_ps(r11,ewtabscale);
1131             ewitab           = _mm_cvttps_epi32(ewrt);
1132             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1133             ewitab           = _mm_slli_epi32(ewitab,2);
1134             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1135             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1136             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1137             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1138             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1139             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1140             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1141             velec            = _mm_mul_ps(qq11,_mm_sub_ps(_mm_sub_ps(rinv11,sh_ewald),velec));
1142             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1143
1144             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1145
1146             /* Update potential sum for this i atom from the interaction with this j atom. */
1147             velec            = _mm_and_ps(velec,cutoff_mask);
1148             velec            = _mm_andnot_ps(dummy_mask,velec);
1149             velecsum         = _mm_add_ps(velecsum,velec);
1150
1151             fscal            = felec;
1152
1153             fscal            = _mm_and_ps(fscal,cutoff_mask);
1154
1155             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1156
1157             /* Calculate temporary vectorial force */
1158             tx               = _mm_mul_ps(fscal,dx11);
1159             ty               = _mm_mul_ps(fscal,dy11);
1160             tz               = _mm_mul_ps(fscal,dz11);
1161
1162             /* Update vectorial force */
1163             fix1             = _mm_add_ps(fix1,tx);
1164             fiy1             = _mm_add_ps(fiy1,ty);
1165             fiz1             = _mm_add_ps(fiz1,tz);
1166
1167             fjx1             = _mm_add_ps(fjx1,tx);
1168             fjy1             = _mm_add_ps(fjy1,ty);
1169             fjz1             = _mm_add_ps(fjz1,tz);
1170             
1171             }
1172
1173             /**************************
1174              * CALCULATE INTERACTIONS *
1175              **************************/
1176
1177             if (gmx_mm_any_lt(rsq12,rcutoff2))
1178             {
1179
1180             r12              = _mm_mul_ps(rsq12,rinv12);
1181             r12              = _mm_andnot_ps(dummy_mask,r12);
1182
1183             /* EWALD ELECTROSTATICS */
1184
1185             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1186             ewrt             = _mm_mul_ps(r12,ewtabscale);
1187             ewitab           = _mm_cvttps_epi32(ewrt);
1188             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1189             ewitab           = _mm_slli_epi32(ewitab,2);
1190             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1191             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1192             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1193             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1194             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1195             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1196             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1197             velec            = _mm_mul_ps(qq12,_mm_sub_ps(_mm_sub_ps(rinv12,sh_ewald),velec));
1198             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1199
1200             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1201
1202             /* Update potential sum for this i atom from the interaction with this j atom. */
1203             velec            = _mm_and_ps(velec,cutoff_mask);
1204             velec            = _mm_andnot_ps(dummy_mask,velec);
1205             velecsum         = _mm_add_ps(velecsum,velec);
1206
1207             fscal            = felec;
1208
1209             fscal            = _mm_and_ps(fscal,cutoff_mask);
1210
1211             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1212
1213             /* Calculate temporary vectorial force */
1214             tx               = _mm_mul_ps(fscal,dx12);
1215             ty               = _mm_mul_ps(fscal,dy12);
1216             tz               = _mm_mul_ps(fscal,dz12);
1217
1218             /* Update vectorial force */
1219             fix1             = _mm_add_ps(fix1,tx);
1220             fiy1             = _mm_add_ps(fiy1,ty);
1221             fiz1             = _mm_add_ps(fiz1,tz);
1222
1223             fjx2             = _mm_add_ps(fjx2,tx);
1224             fjy2             = _mm_add_ps(fjy2,ty);
1225             fjz2             = _mm_add_ps(fjz2,tz);
1226             
1227             }
1228
1229             /**************************
1230              * CALCULATE INTERACTIONS *
1231              **************************/
1232
1233             if (gmx_mm_any_lt(rsq20,rcutoff2))
1234             {
1235
1236             r20              = _mm_mul_ps(rsq20,rinv20);
1237             r20              = _mm_andnot_ps(dummy_mask,r20);
1238
1239             /* EWALD ELECTROSTATICS */
1240
1241             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242             ewrt             = _mm_mul_ps(r20,ewtabscale);
1243             ewitab           = _mm_cvttps_epi32(ewrt);
1244             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1245             ewitab           = _mm_slli_epi32(ewitab,2);
1246             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1247             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1248             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1249             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1250             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1251             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1252             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1253             velec            = _mm_mul_ps(qq20,_mm_sub_ps(_mm_sub_ps(rinv20,sh_ewald),velec));
1254             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1255
1256             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1257
1258             /* Update potential sum for this i atom from the interaction with this j atom. */
1259             velec            = _mm_and_ps(velec,cutoff_mask);
1260             velec            = _mm_andnot_ps(dummy_mask,velec);
1261             velecsum         = _mm_add_ps(velecsum,velec);
1262
1263             fscal            = felec;
1264
1265             fscal            = _mm_and_ps(fscal,cutoff_mask);
1266
1267             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1268
1269             /* Calculate temporary vectorial force */
1270             tx               = _mm_mul_ps(fscal,dx20);
1271             ty               = _mm_mul_ps(fscal,dy20);
1272             tz               = _mm_mul_ps(fscal,dz20);
1273
1274             /* Update vectorial force */
1275             fix2             = _mm_add_ps(fix2,tx);
1276             fiy2             = _mm_add_ps(fiy2,ty);
1277             fiz2             = _mm_add_ps(fiz2,tz);
1278
1279             fjx0             = _mm_add_ps(fjx0,tx);
1280             fjy0             = _mm_add_ps(fjy0,ty);
1281             fjz0             = _mm_add_ps(fjz0,tz);
1282             
1283             }
1284
1285             /**************************
1286              * CALCULATE INTERACTIONS *
1287              **************************/
1288
1289             if (gmx_mm_any_lt(rsq21,rcutoff2))
1290             {
1291
1292             r21              = _mm_mul_ps(rsq21,rinv21);
1293             r21              = _mm_andnot_ps(dummy_mask,r21);
1294
1295             /* EWALD ELECTROSTATICS */
1296
1297             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1298             ewrt             = _mm_mul_ps(r21,ewtabscale);
1299             ewitab           = _mm_cvttps_epi32(ewrt);
1300             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1301             ewitab           = _mm_slli_epi32(ewitab,2);
1302             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1303             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1304             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1305             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1306             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1307             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1308             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1309             velec            = _mm_mul_ps(qq21,_mm_sub_ps(_mm_sub_ps(rinv21,sh_ewald),velec));
1310             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1311
1312             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
1313
1314             /* Update potential sum for this i atom from the interaction with this j atom. */
1315             velec            = _mm_and_ps(velec,cutoff_mask);
1316             velec            = _mm_andnot_ps(dummy_mask,velec);
1317             velecsum         = _mm_add_ps(velecsum,velec);
1318
1319             fscal            = felec;
1320
1321             fscal            = _mm_and_ps(fscal,cutoff_mask);
1322
1323             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1324
1325             /* Calculate temporary vectorial force */
1326             tx               = _mm_mul_ps(fscal,dx21);
1327             ty               = _mm_mul_ps(fscal,dy21);
1328             tz               = _mm_mul_ps(fscal,dz21);
1329
1330             /* Update vectorial force */
1331             fix2             = _mm_add_ps(fix2,tx);
1332             fiy2             = _mm_add_ps(fiy2,ty);
1333             fiz2             = _mm_add_ps(fiz2,tz);
1334
1335             fjx1             = _mm_add_ps(fjx1,tx);
1336             fjy1             = _mm_add_ps(fjy1,ty);
1337             fjz1             = _mm_add_ps(fjz1,tz);
1338             
1339             }
1340
1341             /**************************
1342              * CALCULATE INTERACTIONS *
1343              **************************/
1344
1345             if (gmx_mm_any_lt(rsq22,rcutoff2))
1346             {
1347
1348             r22              = _mm_mul_ps(rsq22,rinv22);
1349             r22              = _mm_andnot_ps(dummy_mask,r22);
1350
1351             /* EWALD ELECTROSTATICS */
1352
1353             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1354             ewrt             = _mm_mul_ps(r22,ewtabscale);
1355             ewitab           = _mm_cvttps_epi32(ewrt);
1356             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1357             ewitab           = _mm_slli_epi32(ewitab,2);
1358             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1359             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1360             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1361             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1362             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1363             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1364             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1365             velec            = _mm_mul_ps(qq22,_mm_sub_ps(_mm_sub_ps(rinv22,sh_ewald),velec));
1366             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1367
1368             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
1369
1370             /* Update potential sum for this i atom from the interaction with this j atom. */
1371             velec            = _mm_and_ps(velec,cutoff_mask);
1372             velec            = _mm_andnot_ps(dummy_mask,velec);
1373             velecsum         = _mm_add_ps(velecsum,velec);
1374
1375             fscal            = felec;
1376
1377             fscal            = _mm_and_ps(fscal,cutoff_mask);
1378
1379             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1380
1381             /* Calculate temporary vectorial force */
1382             tx               = _mm_mul_ps(fscal,dx22);
1383             ty               = _mm_mul_ps(fscal,dy22);
1384             tz               = _mm_mul_ps(fscal,dz22);
1385
1386             /* Update vectorial force */
1387             fix2             = _mm_add_ps(fix2,tx);
1388             fiy2             = _mm_add_ps(fiy2,ty);
1389             fiz2             = _mm_add_ps(fiz2,tz);
1390
1391             fjx2             = _mm_add_ps(fjx2,tx);
1392             fjy2             = _mm_add_ps(fjy2,ty);
1393             fjz2             = _mm_add_ps(fjz2,tz);
1394             
1395             }
1396
1397             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1398             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1399             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1400             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1401
1402             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1403                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1404
1405             /* Inner loop uses 441 flops */
1406         }
1407
1408         /* End of innermost loop */
1409
1410         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1411                                               f+i_coord_offset,fshift+i_shift_offset);
1412
1413         ggid                        = gid[iidx];
1414         /* Update potential energies */
1415         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1416         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1417
1418         /* Increment number of inner iterations */
1419         inneriter                  += j_index_end - j_index_start;
1420
1421         /* Outer loop uses 20 flops */
1422     }
1423
1424     /* Increment number of outer iterations */
1425     outeriter        += nri;
1426
1427     /* Update outer/inner flops */
1428
1429     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*441);
1430 }
1431 /*
1432  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
1433  * Electrostatics interaction: Ewald
1434  * VdW interaction:            LennardJones
1435  * Geometry:                   Water3-Water3
1436  * Calculate force/pot:        Force
1437  */
1438 void
1439 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_sse2_single
1440                     (t_nblist * gmx_restrict                nlist,
1441                      rvec * gmx_restrict                    xx,
1442                      rvec * gmx_restrict                    ff,
1443                      t_forcerec * gmx_restrict              fr,
1444                      t_mdatoms * gmx_restrict               mdatoms,
1445                      nb_kernel_data_t * gmx_restrict        kernel_data,
1446                      t_nrnb * gmx_restrict                  nrnb)
1447 {
1448     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1449      * just 0 for non-waters.
1450      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1451      * jnr indices corresponding to data put in the four positions in the SIMD register.
1452      */
1453     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1454     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1455     int              jnrA,jnrB,jnrC,jnrD;
1456     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1457     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1458     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1459     real             rcutoff_scalar;
1460     real             *shiftvec,*fshift,*x,*f;
1461     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1462     real             scratch[4*DIM];
1463     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1464     int              vdwioffset0;
1465     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1466     int              vdwioffset1;
1467     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1468     int              vdwioffset2;
1469     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1470     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1471     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1472     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1473     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1474     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1475     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1476     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1477     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1478     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1479     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1480     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1481     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1482     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1483     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1484     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1485     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1486     real             *charge;
1487     int              nvdwtype;
1488     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1489     int              *vdwtype;
1490     real             *vdwparam;
1491     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1492     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1493     __m128i          ewitab;
1494     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1495     real             *ewtab;
1496     __m128           dummy_mask,cutoff_mask;
1497     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1498     __m128           one     = _mm_set1_ps(1.0);
1499     __m128           two     = _mm_set1_ps(2.0);
1500     x                = xx[0];
1501     f                = ff[0];
1502
1503     nri              = nlist->nri;
1504     iinr             = nlist->iinr;
1505     jindex           = nlist->jindex;
1506     jjnr             = nlist->jjnr;
1507     shiftidx         = nlist->shift;
1508     gid              = nlist->gid;
1509     shiftvec         = fr->shift_vec[0];
1510     fshift           = fr->fshift[0];
1511     facel            = _mm_set1_ps(fr->epsfac);
1512     charge           = mdatoms->chargeA;
1513     nvdwtype         = fr->ntype;
1514     vdwparam         = fr->nbfp;
1515     vdwtype          = mdatoms->typeA;
1516
1517     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1518     ewtab            = fr->ic->tabq_coul_F;
1519     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1520     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1521
1522     /* Setup water-specific parameters */
1523     inr              = nlist->iinr[0];
1524     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1525     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1526     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1527     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1528
1529     jq0              = _mm_set1_ps(charge[inr+0]);
1530     jq1              = _mm_set1_ps(charge[inr+1]);
1531     jq2              = _mm_set1_ps(charge[inr+2]);
1532     vdwjidx0A        = 2*vdwtype[inr+0];
1533     qq00             = _mm_mul_ps(iq0,jq0);
1534     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1535     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1536     qq01             = _mm_mul_ps(iq0,jq1);
1537     qq02             = _mm_mul_ps(iq0,jq2);
1538     qq10             = _mm_mul_ps(iq1,jq0);
1539     qq11             = _mm_mul_ps(iq1,jq1);
1540     qq12             = _mm_mul_ps(iq1,jq2);
1541     qq20             = _mm_mul_ps(iq2,jq0);
1542     qq21             = _mm_mul_ps(iq2,jq1);
1543     qq22             = _mm_mul_ps(iq2,jq2);
1544
1545     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1546     rcutoff_scalar   = fr->rcoulomb;
1547     rcutoff          = _mm_set1_ps(rcutoff_scalar);
1548     rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);
1549
1550     sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
1551     rvdw             = _mm_set1_ps(fr->rvdw);
1552
1553     /* Avoid stupid compiler warnings */
1554     jnrA = jnrB = jnrC = jnrD = 0;
1555     j_coord_offsetA = 0;
1556     j_coord_offsetB = 0;
1557     j_coord_offsetC = 0;
1558     j_coord_offsetD = 0;
1559
1560     outeriter        = 0;
1561     inneriter        = 0;
1562
1563     for(iidx=0;iidx<4*DIM;iidx++)
1564     {
1565         scratch[iidx] = 0.0;
1566     }  
1567
1568     /* Start outer loop over neighborlists */
1569     for(iidx=0; iidx<nri; iidx++)
1570     {
1571         /* Load shift vector for this list */
1572         i_shift_offset   = DIM*shiftidx[iidx];
1573
1574         /* Load limits for loop over neighbors */
1575         j_index_start    = jindex[iidx];
1576         j_index_end      = jindex[iidx+1];
1577
1578         /* Get outer coordinate index */
1579         inr              = iinr[iidx];
1580         i_coord_offset   = DIM*inr;
1581
1582         /* Load i particle coords and add shift vector */
1583         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1584                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1585         
1586         fix0             = _mm_setzero_ps();
1587         fiy0             = _mm_setzero_ps();
1588         fiz0             = _mm_setzero_ps();
1589         fix1             = _mm_setzero_ps();
1590         fiy1             = _mm_setzero_ps();
1591         fiz1             = _mm_setzero_ps();
1592         fix2             = _mm_setzero_ps();
1593         fiy2             = _mm_setzero_ps();
1594         fiz2             = _mm_setzero_ps();
1595
1596         /* Start inner kernel loop */
1597         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1598         {
1599
1600             /* Get j neighbor index, and coordinate index */
1601             jnrA             = jjnr[jidx];
1602             jnrB             = jjnr[jidx+1];
1603             jnrC             = jjnr[jidx+2];
1604             jnrD             = jjnr[jidx+3];
1605             j_coord_offsetA  = DIM*jnrA;
1606             j_coord_offsetB  = DIM*jnrB;
1607             j_coord_offsetC  = DIM*jnrC;
1608             j_coord_offsetD  = DIM*jnrD;
1609
1610             /* load j atom coordinates */
1611             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1612                                               x+j_coord_offsetC,x+j_coord_offsetD,
1613                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1614
1615             /* Calculate displacement vector */
1616             dx00             = _mm_sub_ps(ix0,jx0);
1617             dy00             = _mm_sub_ps(iy0,jy0);
1618             dz00             = _mm_sub_ps(iz0,jz0);
1619             dx01             = _mm_sub_ps(ix0,jx1);
1620             dy01             = _mm_sub_ps(iy0,jy1);
1621             dz01             = _mm_sub_ps(iz0,jz1);
1622             dx02             = _mm_sub_ps(ix0,jx2);
1623             dy02             = _mm_sub_ps(iy0,jy2);
1624             dz02             = _mm_sub_ps(iz0,jz2);
1625             dx10             = _mm_sub_ps(ix1,jx0);
1626             dy10             = _mm_sub_ps(iy1,jy0);
1627             dz10             = _mm_sub_ps(iz1,jz0);
1628             dx11             = _mm_sub_ps(ix1,jx1);
1629             dy11             = _mm_sub_ps(iy1,jy1);
1630             dz11             = _mm_sub_ps(iz1,jz1);
1631             dx12             = _mm_sub_ps(ix1,jx2);
1632             dy12             = _mm_sub_ps(iy1,jy2);
1633             dz12             = _mm_sub_ps(iz1,jz2);
1634             dx20             = _mm_sub_ps(ix2,jx0);
1635             dy20             = _mm_sub_ps(iy2,jy0);
1636             dz20             = _mm_sub_ps(iz2,jz0);
1637             dx21             = _mm_sub_ps(ix2,jx1);
1638             dy21             = _mm_sub_ps(iy2,jy1);
1639             dz21             = _mm_sub_ps(iz2,jz1);
1640             dx22             = _mm_sub_ps(ix2,jx2);
1641             dy22             = _mm_sub_ps(iy2,jy2);
1642             dz22             = _mm_sub_ps(iz2,jz2);
1643
1644             /* Calculate squared distance and things based on it */
1645             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1646             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1647             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1648             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1649             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1650             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1651             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1652             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1653             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1654
1655             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1656             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1657             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1658             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1659             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1660             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1661             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1662             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1663             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1664
1665             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1666             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1667             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1668             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1669             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1670             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1671             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1672             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1673             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1674
1675             fjx0             = _mm_setzero_ps();
1676             fjy0             = _mm_setzero_ps();
1677             fjz0             = _mm_setzero_ps();
1678             fjx1             = _mm_setzero_ps();
1679             fjy1             = _mm_setzero_ps();
1680             fjz1             = _mm_setzero_ps();
1681             fjx2             = _mm_setzero_ps();
1682             fjy2             = _mm_setzero_ps();
1683             fjz2             = _mm_setzero_ps();
1684
1685             /**************************
1686              * CALCULATE INTERACTIONS *
1687              **************************/
1688
1689             if (gmx_mm_any_lt(rsq00,rcutoff2))
1690             {
1691
1692             r00              = _mm_mul_ps(rsq00,rinv00);
1693
1694             /* EWALD ELECTROSTATICS */
1695
1696             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1697             ewrt             = _mm_mul_ps(r00,ewtabscale);
1698             ewitab           = _mm_cvttps_epi32(ewrt);
1699             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1700             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1701                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1702                                          &ewtabF,&ewtabFn);
1703             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1704             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1705
1706             /* LENNARD-JONES DISPERSION/REPULSION */
1707
1708             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1709             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1710
1711             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
1712
1713             fscal            = _mm_add_ps(felec,fvdw);
1714
1715             fscal            = _mm_and_ps(fscal,cutoff_mask);
1716
1717             /* Calculate temporary vectorial force */
1718             tx               = _mm_mul_ps(fscal,dx00);
1719             ty               = _mm_mul_ps(fscal,dy00);
1720             tz               = _mm_mul_ps(fscal,dz00);
1721
1722             /* Update vectorial force */
1723             fix0             = _mm_add_ps(fix0,tx);
1724             fiy0             = _mm_add_ps(fiy0,ty);
1725             fiz0             = _mm_add_ps(fiz0,tz);
1726
1727             fjx0             = _mm_add_ps(fjx0,tx);
1728             fjy0             = _mm_add_ps(fjy0,ty);
1729             fjz0             = _mm_add_ps(fjz0,tz);
1730             
1731             }
1732
1733             /**************************
1734              * CALCULATE INTERACTIONS *
1735              **************************/
1736
1737             if (gmx_mm_any_lt(rsq01,rcutoff2))
1738             {
1739
1740             r01              = _mm_mul_ps(rsq01,rinv01);
1741
1742             /* EWALD ELECTROSTATICS */
1743
1744             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1745             ewrt             = _mm_mul_ps(r01,ewtabscale);
1746             ewitab           = _mm_cvttps_epi32(ewrt);
1747             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1748             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1749                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1750                                          &ewtabF,&ewtabFn);
1751             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1752             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1753
1754             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
1755
1756             fscal            = felec;
1757
1758             fscal            = _mm_and_ps(fscal,cutoff_mask);
1759
1760             /* Calculate temporary vectorial force */
1761             tx               = _mm_mul_ps(fscal,dx01);
1762             ty               = _mm_mul_ps(fscal,dy01);
1763             tz               = _mm_mul_ps(fscal,dz01);
1764
1765             /* Update vectorial force */
1766             fix0             = _mm_add_ps(fix0,tx);
1767             fiy0             = _mm_add_ps(fiy0,ty);
1768             fiz0             = _mm_add_ps(fiz0,tz);
1769
1770             fjx1             = _mm_add_ps(fjx1,tx);
1771             fjy1             = _mm_add_ps(fjy1,ty);
1772             fjz1             = _mm_add_ps(fjz1,tz);
1773             
1774             }
1775
1776             /**************************
1777              * CALCULATE INTERACTIONS *
1778              **************************/
1779
1780             if (gmx_mm_any_lt(rsq02,rcutoff2))
1781             {
1782
1783             r02              = _mm_mul_ps(rsq02,rinv02);
1784
1785             /* EWALD ELECTROSTATICS */
1786
1787             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1788             ewrt             = _mm_mul_ps(r02,ewtabscale);
1789             ewitab           = _mm_cvttps_epi32(ewrt);
1790             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1791             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1792                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1793                                          &ewtabF,&ewtabFn);
1794             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1795             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1796
1797             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
1798
1799             fscal            = felec;
1800
1801             fscal            = _mm_and_ps(fscal,cutoff_mask);
1802
1803             /* Calculate temporary vectorial force */
1804             tx               = _mm_mul_ps(fscal,dx02);
1805             ty               = _mm_mul_ps(fscal,dy02);
1806             tz               = _mm_mul_ps(fscal,dz02);
1807
1808             /* Update vectorial force */
1809             fix0             = _mm_add_ps(fix0,tx);
1810             fiy0             = _mm_add_ps(fiy0,ty);
1811             fiz0             = _mm_add_ps(fiz0,tz);
1812
1813             fjx2             = _mm_add_ps(fjx2,tx);
1814             fjy2             = _mm_add_ps(fjy2,ty);
1815             fjz2             = _mm_add_ps(fjz2,tz);
1816             
1817             }
1818
1819             /**************************
1820              * CALCULATE INTERACTIONS *
1821              **************************/
1822
1823             if (gmx_mm_any_lt(rsq10,rcutoff2))
1824             {
1825
1826             r10              = _mm_mul_ps(rsq10,rinv10);
1827
1828             /* EWALD ELECTROSTATICS */
1829
1830             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1831             ewrt             = _mm_mul_ps(r10,ewtabscale);
1832             ewitab           = _mm_cvttps_epi32(ewrt);
1833             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1834             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1835                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1836                                          &ewtabF,&ewtabFn);
1837             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1838             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1839
1840             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
1841
1842             fscal            = felec;
1843
1844             fscal            = _mm_and_ps(fscal,cutoff_mask);
1845
1846             /* Calculate temporary vectorial force */
1847             tx               = _mm_mul_ps(fscal,dx10);
1848             ty               = _mm_mul_ps(fscal,dy10);
1849             tz               = _mm_mul_ps(fscal,dz10);
1850
1851             /* Update vectorial force */
1852             fix1             = _mm_add_ps(fix1,tx);
1853             fiy1             = _mm_add_ps(fiy1,ty);
1854             fiz1             = _mm_add_ps(fiz1,tz);
1855
1856             fjx0             = _mm_add_ps(fjx0,tx);
1857             fjy0             = _mm_add_ps(fjy0,ty);
1858             fjz0             = _mm_add_ps(fjz0,tz);
1859             
1860             }
1861
1862             /**************************
1863              * CALCULATE INTERACTIONS *
1864              **************************/
1865
1866             if (gmx_mm_any_lt(rsq11,rcutoff2))
1867             {
1868
1869             r11              = _mm_mul_ps(rsq11,rinv11);
1870
1871             /* EWALD ELECTROSTATICS */
1872
1873             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1874             ewrt             = _mm_mul_ps(r11,ewtabscale);
1875             ewitab           = _mm_cvttps_epi32(ewrt);
1876             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1877             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1878                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1879                                          &ewtabF,&ewtabFn);
1880             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1881             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1882
1883             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
1884
1885             fscal            = felec;
1886
1887             fscal            = _mm_and_ps(fscal,cutoff_mask);
1888
1889             /* Calculate temporary vectorial force */
1890             tx               = _mm_mul_ps(fscal,dx11);
1891             ty               = _mm_mul_ps(fscal,dy11);
1892             tz               = _mm_mul_ps(fscal,dz11);
1893
1894             /* Update vectorial force */
1895             fix1             = _mm_add_ps(fix1,tx);
1896             fiy1             = _mm_add_ps(fiy1,ty);
1897             fiz1             = _mm_add_ps(fiz1,tz);
1898
1899             fjx1             = _mm_add_ps(fjx1,tx);
1900             fjy1             = _mm_add_ps(fjy1,ty);
1901             fjz1             = _mm_add_ps(fjz1,tz);
1902             
1903             }
1904
1905             /**************************
1906              * CALCULATE INTERACTIONS *
1907              **************************/
1908
1909             if (gmx_mm_any_lt(rsq12,rcutoff2))
1910             {
1911
1912             r12              = _mm_mul_ps(rsq12,rinv12);
1913
1914             /* EWALD ELECTROSTATICS */
1915
1916             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1917             ewrt             = _mm_mul_ps(r12,ewtabscale);
1918             ewitab           = _mm_cvttps_epi32(ewrt);
1919             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1920             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1921                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1922                                          &ewtabF,&ewtabFn);
1923             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1924             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1925
1926             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
1927
1928             fscal            = felec;
1929
1930             fscal            = _mm_and_ps(fscal,cutoff_mask);
1931
1932             /* Calculate temporary vectorial force */
1933             tx               = _mm_mul_ps(fscal,dx12);
1934             ty               = _mm_mul_ps(fscal,dy12);
1935             tz               = _mm_mul_ps(fscal,dz12);
1936
1937             /* Update vectorial force */
1938             fix1             = _mm_add_ps(fix1,tx);
1939             fiy1             = _mm_add_ps(fiy1,ty);
1940             fiz1             = _mm_add_ps(fiz1,tz);
1941
1942             fjx2             = _mm_add_ps(fjx2,tx);
1943             fjy2             = _mm_add_ps(fjy2,ty);
1944             fjz2             = _mm_add_ps(fjz2,tz);
1945             
1946             }
1947
1948             /**************************
1949              * CALCULATE INTERACTIONS *
1950              **************************/
1951
1952             if (gmx_mm_any_lt(rsq20,rcutoff2))
1953             {
1954
1955             r20              = _mm_mul_ps(rsq20,rinv20);
1956
1957             /* EWALD ELECTROSTATICS */
1958
1959             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1960             ewrt             = _mm_mul_ps(r20,ewtabscale);
1961             ewitab           = _mm_cvttps_epi32(ewrt);
1962             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1963             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1964                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1965                                          &ewtabF,&ewtabFn);
1966             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1967             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1968
1969             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
1970
1971             fscal            = felec;
1972
1973             fscal            = _mm_and_ps(fscal,cutoff_mask);
1974
1975             /* Calculate temporary vectorial force */
1976             tx               = _mm_mul_ps(fscal,dx20);
1977             ty               = _mm_mul_ps(fscal,dy20);
1978             tz               = _mm_mul_ps(fscal,dz20);
1979
1980             /* Update vectorial force */
1981             fix2             = _mm_add_ps(fix2,tx);
1982             fiy2             = _mm_add_ps(fiy2,ty);
1983             fiz2             = _mm_add_ps(fiz2,tz);
1984
1985             fjx0             = _mm_add_ps(fjx0,tx);
1986             fjy0             = _mm_add_ps(fjy0,ty);
1987             fjz0             = _mm_add_ps(fjz0,tz);
1988             
1989             }
1990
1991             /**************************
1992              * CALCULATE INTERACTIONS *
1993              **************************/
1994
1995             if (gmx_mm_any_lt(rsq21,rcutoff2))
1996             {
1997
1998             r21              = _mm_mul_ps(rsq21,rinv21);
1999
2000             /* EWALD ELECTROSTATICS */
2001
2002             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2003             ewrt             = _mm_mul_ps(r21,ewtabscale);
2004             ewitab           = _mm_cvttps_epi32(ewrt);
2005             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2006             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2007                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2008                                          &ewtabF,&ewtabFn);
2009             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2010             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2011
2012             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2013
2014             fscal            = felec;
2015
2016             fscal            = _mm_and_ps(fscal,cutoff_mask);
2017
2018             /* Calculate temporary vectorial force */
2019             tx               = _mm_mul_ps(fscal,dx21);
2020             ty               = _mm_mul_ps(fscal,dy21);
2021             tz               = _mm_mul_ps(fscal,dz21);
2022
2023             /* Update vectorial force */
2024             fix2             = _mm_add_ps(fix2,tx);
2025             fiy2             = _mm_add_ps(fiy2,ty);
2026             fiz2             = _mm_add_ps(fiz2,tz);
2027
2028             fjx1             = _mm_add_ps(fjx1,tx);
2029             fjy1             = _mm_add_ps(fjy1,ty);
2030             fjz1             = _mm_add_ps(fjz1,tz);
2031             
2032             }
2033
2034             /**************************
2035              * CALCULATE INTERACTIONS *
2036              **************************/
2037
2038             if (gmx_mm_any_lt(rsq22,rcutoff2))
2039             {
2040
2041             r22              = _mm_mul_ps(rsq22,rinv22);
2042
2043             /* EWALD ELECTROSTATICS */
2044
2045             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2046             ewrt             = _mm_mul_ps(r22,ewtabscale);
2047             ewitab           = _mm_cvttps_epi32(ewrt);
2048             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2049             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2050                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2051                                          &ewtabF,&ewtabFn);
2052             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2053             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2054
2055             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2056
2057             fscal            = felec;
2058
2059             fscal            = _mm_and_ps(fscal,cutoff_mask);
2060
2061             /* Calculate temporary vectorial force */
2062             tx               = _mm_mul_ps(fscal,dx22);
2063             ty               = _mm_mul_ps(fscal,dy22);
2064             tz               = _mm_mul_ps(fscal,dz22);
2065
2066             /* Update vectorial force */
2067             fix2             = _mm_add_ps(fix2,tx);
2068             fiy2             = _mm_add_ps(fiy2,ty);
2069             fiz2             = _mm_add_ps(fiz2,tz);
2070
2071             fjx2             = _mm_add_ps(fjx2,tx);
2072             fjy2             = _mm_add_ps(fjy2,ty);
2073             fjz2             = _mm_add_ps(fjz2,tz);
2074             
2075             }
2076
2077             fjptrA             = f+j_coord_offsetA;
2078             fjptrB             = f+j_coord_offsetB;
2079             fjptrC             = f+j_coord_offsetC;
2080             fjptrD             = f+j_coord_offsetD;
2081
2082             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2083                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2084
2085             /* Inner loop uses 358 flops */
2086         }
2087
2088         if(jidx<j_index_end)
2089         {
2090
2091             /* Get j neighbor index, and coordinate index */
2092             jnrlistA         = jjnr[jidx];
2093             jnrlistB         = jjnr[jidx+1];
2094             jnrlistC         = jjnr[jidx+2];
2095             jnrlistD         = jjnr[jidx+3];
2096             /* Sign of each element will be negative for non-real atoms.
2097              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2098              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2099              */
2100             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
2101             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2102             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2103             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2104             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2105             j_coord_offsetA  = DIM*jnrA;
2106             j_coord_offsetB  = DIM*jnrB;
2107             j_coord_offsetC  = DIM*jnrC;
2108             j_coord_offsetD  = DIM*jnrD;
2109
2110             /* load j atom coordinates */
2111             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2112                                               x+j_coord_offsetC,x+j_coord_offsetD,
2113                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2114
2115             /* Calculate displacement vector */
2116             dx00             = _mm_sub_ps(ix0,jx0);
2117             dy00             = _mm_sub_ps(iy0,jy0);
2118             dz00             = _mm_sub_ps(iz0,jz0);
2119             dx01             = _mm_sub_ps(ix0,jx1);
2120             dy01             = _mm_sub_ps(iy0,jy1);
2121             dz01             = _mm_sub_ps(iz0,jz1);
2122             dx02             = _mm_sub_ps(ix0,jx2);
2123             dy02             = _mm_sub_ps(iy0,jy2);
2124             dz02             = _mm_sub_ps(iz0,jz2);
2125             dx10             = _mm_sub_ps(ix1,jx0);
2126             dy10             = _mm_sub_ps(iy1,jy0);
2127             dz10             = _mm_sub_ps(iz1,jz0);
2128             dx11             = _mm_sub_ps(ix1,jx1);
2129             dy11             = _mm_sub_ps(iy1,jy1);
2130             dz11             = _mm_sub_ps(iz1,jz1);
2131             dx12             = _mm_sub_ps(ix1,jx2);
2132             dy12             = _mm_sub_ps(iy1,jy2);
2133             dz12             = _mm_sub_ps(iz1,jz2);
2134             dx20             = _mm_sub_ps(ix2,jx0);
2135             dy20             = _mm_sub_ps(iy2,jy0);
2136             dz20             = _mm_sub_ps(iz2,jz0);
2137             dx21             = _mm_sub_ps(ix2,jx1);
2138             dy21             = _mm_sub_ps(iy2,jy1);
2139             dz21             = _mm_sub_ps(iz2,jz1);
2140             dx22             = _mm_sub_ps(ix2,jx2);
2141             dy22             = _mm_sub_ps(iy2,jy2);
2142             dz22             = _mm_sub_ps(iz2,jz2);
2143
2144             /* Calculate squared distance and things based on it */
2145             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
2146             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
2147             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
2148             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
2149             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
2150             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
2151             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
2152             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
2153             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2154
2155             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2156             rinv01           = gmx_mm_invsqrt_ps(rsq01);
2157             rinv02           = gmx_mm_invsqrt_ps(rsq02);
2158             rinv10           = gmx_mm_invsqrt_ps(rsq10);
2159             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2160             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2161             rinv20           = gmx_mm_invsqrt_ps(rsq20);
2162             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2163             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2164
2165             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
2166             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
2167             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
2168             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
2169             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
2170             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
2171             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
2172             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
2173             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
2174
2175             fjx0             = _mm_setzero_ps();
2176             fjy0             = _mm_setzero_ps();
2177             fjz0             = _mm_setzero_ps();
2178             fjx1             = _mm_setzero_ps();
2179             fjy1             = _mm_setzero_ps();
2180             fjz1             = _mm_setzero_ps();
2181             fjx2             = _mm_setzero_ps();
2182             fjy2             = _mm_setzero_ps();
2183             fjz2             = _mm_setzero_ps();
2184
2185             /**************************
2186              * CALCULATE INTERACTIONS *
2187              **************************/
2188
2189             if (gmx_mm_any_lt(rsq00,rcutoff2))
2190             {
2191
2192             r00              = _mm_mul_ps(rsq00,rinv00);
2193             r00              = _mm_andnot_ps(dummy_mask,r00);
2194
2195             /* EWALD ELECTROSTATICS */
2196
2197             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2198             ewrt             = _mm_mul_ps(r00,ewtabscale);
2199             ewitab           = _mm_cvttps_epi32(ewrt);
2200             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2201             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2202                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2203                                          &ewtabF,&ewtabFn);
2204             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2205             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2206
2207             /* LENNARD-JONES DISPERSION/REPULSION */
2208
2209             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2210             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
2211
2212             cutoff_mask      = _mm_cmplt_ps(rsq00,rcutoff2);
2213
2214             fscal            = _mm_add_ps(felec,fvdw);
2215
2216             fscal            = _mm_and_ps(fscal,cutoff_mask);
2217
2218             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2219
2220             /* Calculate temporary vectorial force */
2221             tx               = _mm_mul_ps(fscal,dx00);
2222             ty               = _mm_mul_ps(fscal,dy00);
2223             tz               = _mm_mul_ps(fscal,dz00);
2224
2225             /* Update vectorial force */
2226             fix0             = _mm_add_ps(fix0,tx);
2227             fiy0             = _mm_add_ps(fiy0,ty);
2228             fiz0             = _mm_add_ps(fiz0,tz);
2229
2230             fjx0             = _mm_add_ps(fjx0,tx);
2231             fjy0             = _mm_add_ps(fjy0,ty);
2232             fjz0             = _mm_add_ps(fjz0,tz);
2233             
2234             }
2235
2236             /**************************
2237              * CALCULATE INTERACTIONS *
2238              **************************/
2239
2240             if (gmx_mm_any_lt(rsq01,rcutoff2))
2241             {
2242
2243             r01              = _mm_mul_ps(rsq01,rinv01);
2244             r01              = _mm_andnot_ps(dummy_mask,r01);
2245
2246             /* EWALD ELECTROSTATICS */
2247
2248             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2249             ewrt             = _mm_mul_ps(r01,ewtabscale);
2250             ewitab           = _mm_cvttps_epi32(ewrt);
2251             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2252             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2253                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2254                                          &ewtabF,&ewtabFn);
2255             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2256             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2257
2258             cutoff_mask      = _mm_cmplt_ps(rsq01,rcutoff2);
2259
2260             fscal            = felec;
2261
2262             fscal            = _mm_and_ps(fscal,cutoff_mask);
2263
2264             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2265
2266             /* Calculate temporary vectorial force */
2267             tx               = _mm_mul_ps(fscal,dx01);
2268             ty               = _mm_mul_ps(fscal,dy01);
2269             tz               = _mm_mul_ps(fscal,dz01);
2270
2271             /* Update vectorial force */
2272             fix0             = _mm_add_ps(fix0,tx);
2273             fiy0             = _mm_add_ps(fiy0,ty);
2274             fiz0             = _mm_add_ps(fiz0,tz);
2275
2276             fjx1             = _mm_add_ps(fjx1,tx);
2277             fjy1             = _mm_add_ps(fjy1,ty);
2278             fjz1             = _mm_add_ps(fjz1,tz);
2279             
2280             }
2281
2282             /**************************
2283              * CALCULATE INTERACTIONS *
2284              **************************/
2285
2286             if (gmx_mm_any_lt(rsq02,rcutoff2))
2287             {
2288
2289             r02              = _mm_mul_ps(rsq02,rinv02);
2290             r02              = _mm_andnot_ps(dummy_mask,r02);
2291
2292             /* EWALD ELECTROSTATICS */
2293
2294             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2295             ewrt             = _mm_mul_ps(r02,ewtabscale);
2296             ewitab           = _mm_cvttps_epi32(ewrt);
2297             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2298             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2299                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2300                                          &ewtabF,&ewtabFn);
2301             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2302             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2303
2304             cutoff_mask      = _mm_cmplt_ps(rsq02,rcutoff2);
2305
2306             fscal            = felec;
2307
2308             fscal            = _mm_and_ps(fscal,cutoff_mask);
2309
2310             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2311
2312             /* Calculate temporary vectorial force */
2313             tx               = _mm_mul_ps(fscal,dx02);
2314             ty               = _mm_mul_ps(fscal,dy02);
2315             tz               = _mm_mul_ps(fscal,dz02);
2316
2317             /* Update vectorial force */
2318             fix0             = _mm_add_ps(fix0,tx);
2319             fiy0             = _mm_add_ps(fiy0,ty);
2320             fiz0             = _mm_add_ps(fiz0,tz);
2321
2322             fjx2             = _mm_add_ps(fjx2,tx);
2323             fjy2             = _mm_add_ps(fjy2,ty);
2324             fjz2             = _mm_add_ps(fjz2,tz);
2325             
2326             }
2327
2328             /**************************
2329              * CALCULATE INTERACTIONS *
2330              **************************/
2331
2332             if (gmx_mm_any_lt(rsq10,rcutoff2))
2333             {
2334
2335             r10              = _mm_mul_ps(rsq10,rinv10);
2336             r10              = _mm_andnot_ps(dummy_mask,r10);
2337
2338             /* EWALD ELECTROSTATICS */
2339
2340             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2341             ewrt             = _mm_mul_ps(r10,ewtabscale);
2342             ewitab           = _mm_cvttps_epi32(ewrt);
2343             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2344             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2345                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2346                                          &ewtabF,&ewtabFn);
2347             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2348             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2349
2350             cutoff_mask      = _mm_cmplt_ps(rsq10,rcutoff2);
2351
2352             fscal            = felec;
2353
2354             fscal            = _mm_and_ps(fscal,cutoff_mask);
2355
2356             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2357
2358             /* Calculate temporary vectorial force */
2359             tx               = _mm_mul_ps(fscal,dx10);
2360             ty               = _mm_mul_ps(fscal,dy10);
2361             tz               = _mm_mul_ps(fscal,dz10);
2362
2363             /* Update vectorial force */
2364             fix1             = _mm_add_ps(fix1,tx);
2365             fiy1             = _mm_add_ps(fiy1,ty);
2366             fiz1             = _mm_add_ps(fiz1,tz);
2367
2368             fjx0             = _mm_add_ps(fjx0,tx);
2369             fjy0             = _mm_add_ps(fjy0,ty);
2370             fjz0             = _mm_add_ps(fjz0,tz);
2371             
2372             }
2373
2374             /**************************
2375              * CALCULATE INTERACTIONS *
2376              **************************/
2377
2378             if (gmx_mm_any_lt(rsq11,rcutoff2))
2379             {
2380
2381             r11              = _mm_mul_ps(rsq11,rinv11);
2382             r11              = _mm_andnot_ps(dummy_mask,r11);
2383
2384             /* EWALD ELECTROSTATICS */
2385
2386             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2387             ewrt             = _mm_mul_ps(r11,ewtabscale);
2388             ewitab           = _mm_cvttps_epi32(ewrt);
2389             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2390             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2391                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2392                                          &ewtabF,&ewtabFn);
2393             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2394             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2395
2396             cutoff_mask      = _mm_cmplt_ps(rsq11,rcutoff2);
2397
2398             fscal            = felec;
2399
2400             fscal            = _mm_and_ps(fscal,cutoff_mask);
2401
2402             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2403
2404             /* Calculate temporary vectorial force */
2405             tx               = _mm_mul_ps(fscal,dx11);
2406             ty               = _mm_mul_ps(fscal,dy11);
2407             tz               = _mm_mul_ps(fscal,dz11);
2408
2409             /* Update vectorial force */
2410             fix1             = _mm_add_ps(fix1,tx);
2411             fiy1             = _mm_add_ps(fiy1,ty);
2412             fiz1             = _mm_add_ps(fiz1,tz);
2413
2414             fjx1             = _mm_add_ps(fjx1,tx);
2415             fjy1             = _mm_add_ps(fjy1,ty);
2416             fjz1             = _mm_add_ps(fjz1,tz);
2417             
2418             }
2419
2420             /**************************
2421              * CALCULATE INTERACTIONS *
2422              **************************/
2423
2424             if (gmx_mm_any_lt(rsq12,rcutoff2))
2425             {
2426
2427             r12              = _mm_mul_ps(rsq12,rinv12);
2428             r12              = _mm_andnot_ps(dummy_mask,r12);
2429
2430             /* EWALD ELECTROSTATICS */
2431
2432             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2433             ewrt             = _mm_mul_ps(r12,ewtabscale);
2434             ewitab           = _mm_cvttps_epi32(ewrt);
2435             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2436             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2437                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2438                                          &ewtabF,&ewtabFn);
2439             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2440             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2441
2442             cutoff_mask      = _mm_cmplt_ps(rsq12,rcutoff2);
2443
2444             fscal            = felec;
2445
2446             fscal            = _mm_and_ps(fscal,cutoff_mask);
2447
2448             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2449
2450             /* Calculate temporary vectorial force */
2451             tx               = _mm_mul_ps(fscal,dx12);
2452             ty               = _mm_mul_ps(fscal,dy12);
2453             tz               = _mm_mul_ps(fscal,dz12);
2454
2455             /* Update vectorial force */
2456             fix1             = _mm_add_ps(fix1,tx);
2457             fiy1             = _mm_add_ps(fiy1,ty);
2458             fiz1             = _mm_add_ps(fiz1,tz);
2459
2460             fjx2             = _mm_add_ps(fjx2,tx);
2461             fjy2             = _mm_add_ps(fjy2,ty);
2462             fjz2             = _mm_add_ps(fjz2,tz);
2463             
2464             }
2465
2466             /**************************
2467              * CALCULATE INTERACTIONS *
2468              **************************/
2469
2470             if (gmx_mm_any_lt(rsq20,rcutoff2))
2471             {
2472
2473             r20              = _mm_mul_ps(rsq20,rinv20);
2474             r20              = _mm_andnot_ps(dummy_mask,r20);
2475
2476             /* EWALD ELECTROSTATICS */
2477
2478             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2479             ewrt             = _mm_mul_ps(r20,ewtabscale);
2480             ewitab           = _mm_cvttps_epi32(ewrt);
2481             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2482             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2483                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2484                                          &ewtabF,&ewtabFn);
2485             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2486             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2487
2488             cutoff_mask      = _mm_cmplt_ps(rsq20,rcutoff2);
2489
2490             fscal            = felec;
2491
2492             fscal            = _mm_and_ps(fscal,cutoff_mask);
2493
2494             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2495
2496             /* Calculate temporary vectorial force */
2497             tx               = _mm_mul_ps(fscal,dx20);
2498             ty               = _mm_mul_ps(fscal,dy20);
2499             tz               = _mm_mul_ps(fscal,dz20);
2500
2501             /* Update vectorial force */
2502             fix2             = _mm_add_ps(fix2,tx);
2503             fiy2             = _mm_add_ps(fiy2,ty);
2504             fiz2             = _mm_add_ps(fiz2,tz);
2505
2506             fjx0             = _mm_add_ps(fjx0,tx);
2507             fjy0             = _mm_add_ps(fjy0,ty);
2508             fjz0             = _mm_add_ps(fjz0,tz);
2509             
2510             }
2511
2512             /**************************
2513              * CALCULATE INTERACTIONS *
2514              **************************/
2515
2516             if (gmx_mm_any_lt(rsq21,rcutoff2))
2517             {
2518
2519             r21              = _mm_mul_ps(rsq21,rinv21);
2520             r21              = _mm_andnot_ps(dummy_mask,r21);
2521
2522             /* EWALD ELECTROSTATICS */
2523
2524             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2525             ewrt             = _mm_mul_ps(r21,ewtabscale);
2526             ewitab           = _mm_cvttps_epi32(ewrt);
2527             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2528             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2529                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2530                                          &ewtabF,&ewtabFn);
2531             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2532             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2533
2534             cutoff_mask      = _mm_cmplt_ps(rsq21,rcutoff2);
2535
2536             fscal            = felec;
2537
2538             fscal            = _mm_and_ps(fscal,cutoff_mask);
2539
2540             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2541
2542             /* Calculate temporary vectorial force */
2543             tx               = _mm_mul_ps(fscal,dx21);
2544             ty               = _mm_mul_ps(fscal,dy21);
2545             tz               = _mm_mul_ps(fscal,dz21);
2546
2547             /* Update vectorial force */
2548             fix2             = _mm_add_ps(fix2,tx);
2549             fiy2             = _mm_add_ps(fiy2,ty);
2550             fiz2             = _mm_add_ps(fiz2,tz);
2551
2552             fjx1             = _mm_add_ps(fjx1,tx);
2553             fjy1             = _mm_add_ps(fjy1,ty);
2554             fjz1             = _mm_add_ps(fjz1,tz);
2555             
2556             }
2557
2558             /**************************
2559              * CALCULATE INTERACTIONS *
2560              **************************/
2561
2562             if (gmx_mm_any_lt(rsq22,rcutoff2))
2563             {
2564
2565             r22              = _mm_mul_ps(rsq22,rinv22);
2566             r22              = _mm_andnot_ps(dummy_mask,r22);
2567
2568             /* EWALD ELECTROSTATICS */
2569
2570             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2571             ewrt             = _mm_mul_ps(r22,ewtabscale);
2572             ewitab           = _mm_cvttps_epi32(ewrt);
2573             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2574             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2575                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2576                                          &ewtabF,&ewtabFn);
2577             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2578             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2579
2580             cutoff_mask      = _mm_cmplt_ps(rsq22,rcutoff2);
2581
2582             fscal            = felec;
2583
2584             fscal            = _mm_and_ps(fscal,cutoff_mask);
2585
2586             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2587
2588             /* Calculate temporary vectorial force */
2589             tx               = _mm_mul_ps(fscal,dx22);
2590             ty               = _mm_mul_ps(fscal,dy22);
2591             tz               = _mm_mul_ps(fscal,dz22);
2592
2593             /* Update vectorial force */
2594             fix2             = _mm_add_ps(fix2,tx);
2595             fiy2             = _mm_add_ps(fiy2,ty);
2596             fiz2             = _mm_add_ps(fiz2,tz);
2597
2598             fjx2             = _mm_add_ps(fjx2,tx);
2599             fjy2             = _mm_add_ps(fjy2,ty);
2600             fjz2             = _mm_add_ps(fjz2,tz);
2601             
2602             }
2603
2604             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2605             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2606             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2607             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2608
2609             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2610                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2611
2612             /* Inner loop uses 367 flops */
2613         }
2614
2615         /* End of innermost loop */
2616
2617         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2618                                               f+i_coord_offset,fshift+i_shift_offset);
2619
2620         /* Increment number of inner iterations */
2621         inneriter                  += j_index_end - j_index_start;
2622
2623         /* Outer loop uses 18 flops */
2624     }
2625
2626     /* Increment number of outer iterations */
2627     outeriter        += nri;
2628
2629     /* Update outer/inner flops */
2630
2631     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*367);
2632 }