Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecEw_VdwCSTab_GeomW3W3_sse2_single.c
1 /*
2  * Note: this file was generated by the Gromacs sse2_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_sse2_single.h"
34 #include "kernelutil_x86_sse2_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            CubicSplineTable
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_VF_sse2_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
56      * jnr indices corresponding to data put in the four positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
63     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
64     real             rcutoff_scalar;
65     real             *shiftvec,*fshift,*x,*f;
66     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
67     real             scratch[4*DIM];
68     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
69     int              vdwioffset0;
70     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
71     int              vdwioffset1;
72     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
73     int              vdwioffset2;
74     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
75     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
76     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
77     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
78     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
79     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
80     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
81     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
82     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
83     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
84     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
85     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
86     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
87     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
88     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
89     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
90     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
91     real             *charge;
92     int              nvdwtype;
93     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
94     int              *vdwtype;
95     real             *vdwparam;
96     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
97     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
98     __m128i          vfitab;
99     __m128i          ifour       = _mm_set1_epi32(4);
100     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
101     real             *vftab;
102     __m128i          ewitab;
103     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
104     real             *ewtab;
105     __m128           dummy_mask,cutoff_mask;
106     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
107     __m128           one     = _mm_set1_ps(1.0);
108     __m128           two     = _mm_set1_ps(2.0);
109     x                = xx[0];
110     f                = ff[0];
111
112     nri              = nlist->nri;
113     iinr             = nlist->iinr;
114     jindex           = nlist->jindex;
115     jjnr             = nlist->jjnr;
116     shiftidx         = nlist->shift;
117     gid              = nlist->gid;
118     shiftvec         = fr->shift_vec[0];
119     fshift           = fr->fshift[0];
120     facel            = _mm_set1_ps(fr->epsfac);
121     charge           = mdatoms->chargeA;
122     nvdwtype         = fr->ntype;
123     vdwparam         = fr->nbfp;
124     vdwtype          = mdatoms->typeA;
125
126     vftab            = kernel_data->table_vdw->data;
127     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
128
129     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
130     ewtab            = fr->ic->tabq_coul_FDV0;
131     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
132     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
133
134     /* Setup water-specific parameters */
135     inr              = nlist->iinr[0];
136     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
137     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
138     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
139     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
140
141     jq0              = _mm_set1_ps(charge[inr+0]);
142     jq1              = _mm_set1_ps(charge[inr+1]);
143     jq2              = _mm_set1_ps(charge[inr+2]);
144     vdwjidx0A        = 2*vdwtype[inr+0];
145     qq00             = _mm_mul_ps(iq0,jq0);
146     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
147     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
148     qq01             = _mm_mul_ps(iq0,jq1);
149     qq02             = _mm_mul_ps(iq0,jq2);
150     qq10             = _mm_mul_ps(iq1,jq0);
151     qq11             = _mm_mul_ps(iq1,jq1);
152     qq12             = _mm_mul_ps(iq1,jq2);
153     qq20             = _mm_mul_ps(iq2,jq0);
154     qq21             = _mm_mul_ps(iq2,jq1);
155     qq22             = _mm_mul_ps(iq2,jq2);
156
157     /* Avoid stupid compiler warnings */
158     jnrA = jnrB = jnrC = jnrD = 0;
159     j_coord_offsetA = 0;
160     j_coord_offsetB = 0;
161     j_coord_offsetC = 0;
162     j_coord_offsetD = 0;
163
164     outeriter        = 0;
165     inneriter        = 0;
166
167     for(iidx=0;iidx<4*DIM;iidx++)
168     {
169         scratch[iidx] = 0.0;
170     }  
171
172     /* Start outer loop over neighborlists */
173     for(iidx=0; iidx<nri; iidx++)
174     {
175         /* Load shift vector for this list */
176         i_shift_offset   = DIM*shiftidx[iidx];
177
178         /* Load limits for loop over neighbors */
179         j_index_start    = jindex[iidx];
180         j_index_end      = jindex[iidx+1];
181
182         /* Get outer coordinate index */
183         inr              = iinr[iidx];
184         i_coord_offset   = DIM*inr;
185
186         /* Load i particle coords and add shift vector */
187         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
188                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
189         
190         fix0             = _mm_setzero_ps();
191         fiy0             = _mm_setzero_ps();
192         fiz0             = _mm_setzero_ps();
193         fix1             = _mm_setzero_ps();
194         fiy1             = _mm_setzero_ps();
195         fiz1             = _mm_setzero_ps();
196         fix2             = _mm_setzero_ps();
197         fiy2             = _mm_setzero_ps();
198         fiz2             = _mm_setzero_ps();
199
200         /* Reset potential sums */
201         velecsum         = _mm_setzero_ps();
202         vvdwsum          = _mm_setzero_ps();
203
204         /* Start inner kernel loop */
205         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
206         {
207
208             /* Get j neighbor index, and coordinate index */
209             jnrA             = jjnr[jidx];
210             jnrB             = jjnr[jidx+1];
211             jnrC             = jjnr[jidx+2];
212             jnrD             = jjnr[jidx+3];
213             j_coord_offsetA  = DIM*jnrA;
214             j_coord_offsetB  = DIM*jnrB;
215             j_coord_offsetC  = DIM*jnrC;
216             j_coord_offsetD  = DIM*jnrD;
217
218             /* load j atom coordinates */
219             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
220                                               x+j_coord_offsetC,x+j_coord_offsetD,
221                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
222
223             /* Calculate displacement vector */
224             dx00             = _mm_sub_ps(ix0,jx0);
225             dy00             = _mm_sub_ps(iy0,jy0);
226             dz00             = _mm_sub_ps(iz0,jz0);
227             dx01             = _mm_sub_ps(ix0,jx1);
228             dy01             = _mm_sub_ps(iy0,jy1);
229             dz01             = _mm_sub_ps(iz0,jz1);
230             dx02             = _mm_sub_ps(ix0,jx2);
231             dy02             = _mm_sub_ps(iy0,jy2);
232             dz02             = _mm_sub_ps(iz0,jz2);
233             dx10             = _mm_sub_ps(ix1,jx0);
234             dy10             = _mm_sub_ps(iy1,jy0);
235             dz10             = _mm_sub_ps(iz1,jz0);
236             dx11             = _mm_sub_ps(ix1,jx1);
237             dy11             = _mm_sub_ps(iy1,jy1);
238             dz11             = _mm_sub_ps(iz1,jz1);
239             dx12             = _mm_sub_ps(ix1,jx2);
240             dy12             = _mm_sub_ps(iy1,jy2);
241             dz12             = _mm_sub_ps(iz1,jz2);
242             dx20             = _mm_sub_ps(ix2,jx0);
243             dy20             = _mm_sub_ps(iy2,jy0);
244             dz20             = _mm_sub_ps(iz2,jz0);
245             dx21             = _mm_sub_ps(ix2,jx1);
246             dy21             = _mm_sub_ps(iy2,jy1);
247             dz21             = _mm_sub_ps(iz2,jz1);
248             dx22             = _mm_sub_ps(ix2,jx2);
249             dy22             = _mm_sub_ps(iy2,jy2);
250             dz22             = _mm_sub_ps(iz2,jz2);
251
252             /* Calculate squared distance and things based on it */
253             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
254             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
255             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
256             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
257             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
258             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
259             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
260             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
261             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
262
263             rinv00           = gmx_mm_invsqrt_ps(rsq00);
264             rinv01           = gmx_mm_invsqrt_ps(rsq01);
265             rinv02           = gmx_mm_invsqrt_ps(rsq02);
266             rinv10           = gmx_mm_invsqrt_ps(rsq10);
267             rinv11           = gmx_mm_invsqrt_ps(rsq11);
268             rinv12           = gmx_mm_invsqrt_ps(rsq12);
269             rinv20           = gmx_mm_invsqrt_ps(rsq20);
270             rinv21           = gmx_mm_invsqrt_ps(rsq21);
271             rinv22           = gmx_mm_invsqrt_ps(rsq22);
272
273             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
274             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
275             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
276             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
277             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
278             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
279             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
280             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
281             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
282
283             fjx0             = _mm_setzero_ps();
284             fjy0             = _mm_setzero_ps();
285             fjz0             = _mm_setzero_ps();
286             fjx1             = _mm_setzero_ps();
287             fjy1             = _mm_setzero_ps();
288             fjz1             = _mm_setzero_ps();
289             fjx2             = _mm_setzero_ps();
290             fjy2             = _mm_setzero_ps();
291             fjz2             = _mm_setzero_ps();
292
293             /**************************
294              * CALCULATE INTERACTIONS *
295              **************************/
296
297             r00              = _mm_mul_ps(rsq00,rinv00);
298
299             /* Calculate table index by multiplying r with table scale and truncate to integer */
300             rt               = _mm_mul_ps(r00,vftabscale);
301             vfitab           = _mm_cvttps_epi32(rt);
302             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
303             vfitab           = _mm_slli_epi32(vfitab,3);
304
305             /* EWALD ELECTROSTATICS */
306
307             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
308             ewrt             = _mm_mul_ps(r00,ewtabscale);
309             ewitab           = _mm_cvttps_epi32(ewrt);
310             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
311             ewitab           = _mm_slli_epi32(ewitab,2);
312             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
313             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
314             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
315             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
316             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
317             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
318             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
319             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
320             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
321
322             /* CUBIC SPLINE TABLE DISPERSION */
323             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
324             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
325             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
326             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
327             _MM_TRANSPOSE4_PS(Y,F,G,H);
328             Heps             = _mm_mul_ps(vfeps,H);
329             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
330             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
331             vvdw6            = _mm_mul_ps(c6_00,VV);
332             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
333             fvdw6            = _mm_mul_ps(c6_00,FF);
334
335             /* CUBIC SPLINE TABLE REPULSION */
336             vfitab           = _mm_add_epi32(vfitab,ifour);
337             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
338             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
339             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
340             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
341             _MM_TRANSPOSE4_PS(Y,F,G,H);
342             Heps             = _mm_mul_ps(vfeps,H);
343             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
344             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
345             vvdw12           = _mm_mul_ps(c12_00,VV);
346             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
347             fvdw12           = _mm_mul_ps(c12_00,FF);
348             vvdw             = _mm_add_ps(vvdw12,vvdw6);
349             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
350
351             /* Update potential sum for this i atom from the interaction with this j atom. */
352             velecsum         = _mm_add_ps(velecsum,velec);
353             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
354
355             fscal            = _mm_add_ps(felec,fvdw);
356
357             /* Calculate temporary vectorial force */
358             tx               = _mm_mul_ps(fscal,dx00);
359             ty               = _mm_mul_ps(fscal,dy00);
360             tz               = _mm_mul_ps(fscal,dz00);
361
362             /* Update vectorial force */
363             fix0             = _mm_add_ps(fix0,tx);
364             fiy0             = _mm_add_ps(fiy0,ty);
365             fiz0             = _mm_add_ps(fiz0,tz);
366
367             fjx0             = _mm_add_ps(fjx0,tx);
368             fjy0             = _mm_add_ps(fjy0,ty);
369             fjz0             = _mm_add_ps(fjz0,tz);
370             
371             /**************************
372              * CALCULATE INTERACTIONS *
373              **************************/
374
375             r01              = _mm_mul_ps(rsq01,rinv01);
376
377             /* EWALD ELECTROSTATICS */
378
379             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
380             ewrt             = _mm_mul_ps(r01,ewtabscale);
381             ewitab           = _mm_cvttps_epi32(ewrt);
382             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
383             ewitab           = _mm_slli_epi32(ewitab,2);
384             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
385             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
386             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
387             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
388             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
389             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
390             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
391             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
392             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
393
394             /* Update potential sum for this i atom from the interaction with this j atom. */
395             velecsum         = _mm_add_ps(velecsum,velec);
396
397             fscal            = felec;
398
399             /* Calculate temporary vectorial force */
400             tx               = _mm_mul_ps(fscal,dx01);
401             ty               = _mm_mul_ps(fscal,dy01);
402             tz               = _mm_mul_ps(fscal,dz01);
403
404             /* Update vectorial force */
405             fix0             = _mm_add_ps(fix0,tx);
406             fiy0             = _mm_add_ps(fiy0,ty);
407             fiz0             = _mm_add_ps(fiz0,tz);
408
409             fjx1             = _mm_add_ps(fjx1,tx);
410             fjy1             = _mm_add_ps(fjy1,ty);
411             fjz1             = _mm_add_ps(fjz1,tz);
412             
413             /**************************
414              * CALCULATE INTERACTIONS *
415              **************************/
416
417             r02              = _mm_mul_ps(rsq02,rinv02);
418
419             /* EWALD ELECTROSTATICS */
420
421             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
422             ewrt             = _mm_mul_ps(r02,ewtabscale);
423             ewitab           = _mm_cvttps_epi32(ewrt);
424             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
425             ewitab           = _mm_slli_epi32(ewitab,2);
426             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
427             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
428             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
429             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
430             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
431             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
432             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
433             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
434             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
435
436             /* Update potential sum for this i atom from the interaction with this j atom. */
437             velecsum         = _mm_add_ps(velecsum,velec);
438
439             fscal            = felec;
440
441             /* Calculate temporary vectorial force */
442             tx               = _mm_mul_ps(fscal,dx02);
443             ty               = _mm_mul_ps(fscal,dy02);
444             tz               = _mm_mul_ps(fscal,dz02);
445
446             /* Update vectorial force */
447             fix0             = _mm_add_ps(fix0,tx);
448             fiy0             = _mm_add_ps(fiy0,ty);
449             fiz0             = _mm_add_ps(fiz0,tz);
450
451             fjx2             = _mm_add_ps(fjx2,tx);
452             fjy2             = _mm_add_ps(fjy2,ty);
453             fjz2             = _mm_add_ps(fjz2,tz);
454             
455             /**************************
456              * CALCULATE INTERACTIONS *
457              **************************/
458
459             r10              = _mm_mul_ps(rsq10,rinv10);
460
461             /* EWALD ELECTROSTATICS */
462
463             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
464             ewrt             = _mm_mul_ps(r10,ewtabscale);
465             ewitab           = _mm_cvttps_epi32(ewrt);
466             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
467             ewitab           = _mm_slli_epi32(ewitab,2);
468             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
469             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
470             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
471             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
472             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
473             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
474             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
475             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
476             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
477
478             /* Update potential sum for this i atom from the interaction with this j atom. */
479             velecsum         = _mm_add_ps(velecsum,velec);
480
481             fscal            = felec;
482
483             /* Calculate temporary vectorial force */
484             tx               = _mm_mul_ps(fscal,dx10);
485             ty               = _mm_mul_ps(fscal,dy10);
486             tz               = _mm_mul_ps(fscal,dz10);
487
488             /* Update vectorial force */
489             fix1             = _mm_add_ps(fix1,tx);
490             fiy1             = _mm_add_ps(fiy1,ty);
491             fiz1             = _mm_add_ps(fiz1,tz);
492
493             fjx0             = _mm_add_ps(fjx0,tx);
494             fjy0             = _mm_add_ps(fjy0,ty);
495             fjz0             = _mm_add_ps(fjz0,tz);
496             
497             /**************************
498              * CALCULATE INTERACTIONS *
499              **************************/
500
501             r11              = _mm_mul_ps(rsq11,rinv11);
502
503             /* EWALD ELECTROSTATICS */
504
505             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
506             ewrt             = _mm_mul_ps(r11,ewtabscale);
507             ewitab           = _mm_cvttps_epi32(ewrt);
508             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
509             ewitab           = _mm_slli_epi32(ewitab,2);
510             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
511             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
512             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
513             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
514             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
515             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
516             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
517             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
518             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
519
520             /* Update potential sum for this i atom from the interaction with this j atom. */
521             velecsum         = _mm_add_ps(velecsum,velec);
522
523             fscal            = felec;
524
525             /* Calculate temporary vectorial force */
526             tx               = _mm_mul_ps(fscal,dx11);
527             ty               = _mm_mul_ps(fscal,dy11);
528             tz               = _mm_mul_ps(fscal,dz11);
529
530             /* Update vectorial force */
531             fix1             = _mm_add_ps(fix1,tx);
532             fiy1             = _mm_add_ps(fiy1,ty);
533             fiz1             = _mm_add_ps(fiz1,tz);
534
535             fjx1             = _mm_add_ps(fjx1,tx);
536             fjy1             = _mm_add_ps(fjy1,ty);
537             fjz1             = _mm_add_ps(fjz1,tz);
538             
539             /**************************
540              * CALCULATE INTERACTIONS *
541              **************************/
542
543             r12              = _mm_mul_ps(rsq12,rinv12);
544
545             /* EWALD ELECTROSTATICS */
546
547             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
548             ewrt             = _mm_mul_ps(r12,ewtabscale);
549             ewitab           = _mm_cvttps_epi32(ewrt);
550             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
551             ewitab           = _mm_slli_epi32(ewitab,2);
552             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
553             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
554             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
555             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
556             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
557             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
558             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
559             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
560             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
561
562             /* Update potential sum for this i atom from the interaction with this j atom. */
563             velecsum         = _mm_add_ps(velecsum,velec);
564
565             fscal            = felec;
566
567             /* Calculate temporary vectorial force */
568             tx               = _mm_mul_ps(fscal,dx12);
569             ty               = _mm_mul_ps(fscal,dy12);
570             tz               = _mm_mul_ps(fscal,dz12);
571
572             /* Update vectorial force */
573             fix1             = _mm_add_ps(fix1,tx);
574             fiy1             = _mm_add_ps(fiy1,ty);
575             fiz1             = _mm_add_ps(fiz1,tz);
576
577             fjx2             = _mm_add_ps(fjx2,tx);
578             fjy2             = _mm_add_ps(fjy2,ty);
579             fjz2             = _mm_add_ps(fjz2,tz);
580             
581             /**************************
582              * CALCULATE INTERACTIONS *
583              **************************/
584
585             r20              = _mm_mul_ps(rsq20,rinv20);
586
587             /* EWALD ELECTROSTATICS */
588
589             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
590             ewrt             = _mm_mul_ps(r20,ewtabscale);
591             ewitab           = _mm_cvttps_epi32(ewrt);
592             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
593             ewitab           = _mm_slli_epi32(ewitab,2);
594             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
595             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
596             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
597             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
598             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
599             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
600             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
601             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
602             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
603
604             /* Update potential sum for this i atom from the interaction with this j atom. */
605             velecsum         = _mm_add_ps(velecsum,velec);
606
607             fscal            = felec;
608
609             /* Calculate temporary vectorial force */
610             tx               = _mm_mul_ps(fscal,dx20);
611             ty               = _mm_mul_ps(fscal,dy20);
612             tz               = _mm_mul_ps(fscal,dz20);
613
614             /* Update vectorial force */
615             fix2             = _mm_add_ps(fix2,tx);
616             fiy2             = _mm_add_ps(fiy2,ty);
617             fiz2             = _mm_add_ps(fiz2,tz);
618
619             fjx0             = _mm_add_ps(fjx0,tx);
620             fjy0             = _mm_add_ps(fjy0,ty);
621             fjz0             = _mm_add_ps(fjz0,tz);
622             
623             /**************************
624              * CALCULATE INTERACTIONS *
625              **************************/
626
627             r21              = _mm_mul_ps(rsq21,rinv21);
628
629             /* EWALD ELECTROSTATICS */
630
631             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
632             ewrt             = _mm_mul_ps(r21,ewtabscale);
633             ewitab           = _mm_cvttps_epi32(ewrt);
634             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
635             ewitab           = _mm_slli_epi32(ewitab,2);
636             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
637             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
638             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
639             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
640             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
641             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
642             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
643             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
644             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
645
646             /* Update potential sum for this i atom from the interaction with this j atom. */
647             velecsum         = _mm_add_ps(velecsum,velec);
648
649             fscal            = felec;
650
651             /* Calculate temporary vectorial force */
652             tx               = _mm_mul_ps(fscal,dx21);
653             ty               = _mm_mul_ps(fscal,dy21);
654             tz               = _mm_mul_ps(fscal,dz21);
655
656             /* Update vectorial force */
657             fix2             = _mm_add_ps(fix2,tx);
658             fiy2             = _mm_add_ps(fiy2,ty);
659             fiz2             = _mm_add_ps(fiz2,tz);
660
661             fjx1             = _mm_add_ps(fjx1,tx);
662             fjy1             = _mm_add_ps(fjy1,ty);
663             fjz1             = _mm_add_ps(fjz1,tz);
664             
665             /**************************
666              * CALCULATE INTERACTIONS *
667              **************************/
668
669             r22              = _mm_mul_ps(rsq22,rinv22);
670
671             /* EWALD ELECTROSTATICS */
672
673             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
674             ewrt             = _mm_mul_ps(r22,ewtabscale);
675             ewitab           = _mm_cvttps_epi32(ewrt);
676             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
677             ewitab           = _mm_slli_epi32(ewitab,2);
678             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
679             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
680             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
681             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
682             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
683             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
684             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
685             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
686             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
687
688             /* Update potential sum for this i atom from the interaction with this j atom. */
689             velecsum         = _mm_add_ps(velecsum,velec);
690
691             fscal            = felec;
692
693             /* Calculate temporary vectorial force */
694             tx               = _mm_mul_ps(fscal,dx22);
695             ty               = _mm_mul_ps(fscal,dy22);
696             tz               = _mm_mul_ps(fscal,dz22);
697
698             /* Update vectorial force */
699             fix2             = _mm_add_ps(fix2,tx);
700             fiy2             = _mm_add_ps(fiy2,ty);
701             fiz2             = _mm_add_ps(fiz2,tz);
702
703             fjx2             = _mm_add_ps(fjx2,tx);
704             fjy2             = _mm_add_ps(fjy2,ty);
705             fjz2             = _mm_add_ps(fjz2,tz);
706             
707             fjptrA             = f+j_coord_offsetA;
708             fjptrB             = f+j_coord_offsetB;
709             fjptrC             = f+j_coord_offsetC;
710             fjptrD             = f+j_coord_offsetD;
711
712             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
713                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
714
715             /* Inner loop uses 403 flops */
716         }
717
718         if(jidx<j_index_end)
719         {
720
721             /* Get j neighbor index, and coordinate index */
722             jnrlistA         = jjnr[jidx];
723             jnrlistB         = jjnr[jidx+1];
724             jnrlistC         = jjnr[jidx+2];
725             jnrlistD         = jjnr[jidx+3];
726             /* Sign of each element will be negative for non-real atoms.
727              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
728              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
729              */
730             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
731             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
732             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
733             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
734             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
735             j_coord_offsetA  = DIM*jnrA;
736             j_coord_offsetB  = DIM*jnrB;
737             j_coord_offsetC  = DIM*jnrC;
738             j_coord_offsetD  = DIM*jnrD;
739
740             /* load j atom coordinates */
741             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
742                                               x+j_coord_offsetC,x+j_coord_offsetD,
743                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
744
745             /* Calculate displacement vector */
746             dx00             = _mm_sub_ps(ix0,jx0);
747             dy00             = _mm_sub_ps(iy0,jy0);
748             dz00             = _mm_sub_ps(iz0,jz0);
749             dx01             = _mm_sub_ps(ix0,jx1);
750             dy01             = _mm_sub_ps(iy0,jy1);
751             dz01             = _mm_sub_ps(iz0,jz1);
752             dx02             = _mm_sub_ps(ix0,jx2);
753             dy02             = _mm_sub_ps(iy0,jy2);
754             dz02             = _mm_sub_ps(iz0,jz2);
755             dx10             = _mm_sub_ps(ix1,jx0);
756             dy10             = _mm_sub_ps(iy1,jy0);
757             dz10             = _mm_sub_ps(iz1,jz0);
758             dx11             = _mm_sub_ps(ix1,jx1);
759             dy11             = _mm_sub_ps(iy1,jy1);
760             dz11             = _mm_sub_ps(iz1,jz1);
761             dx12             = _mm_sub_ps(ix1,jx2);
762             dy12             = _mm_sub_ps(iy1,jy2);
763             dz12             = _mm_sub_ps(iz1,jz2);
764             dx20             = _mm_sub_ps(ix2,jx0);
765             dy20             = _mm_sub_ps(iy2,jy0);
766             dz20             = _mm_sub_ps(iz2,jz0);
767             dx21             = _mm_sub_ps(ix2,jx1);
768             dy21             = _mm_sub_ps(iy2,jy1);
769             dz21             = _mm_sub_ps(iz2,jz1);
770             dx22             = _mm_sub_ps(ix2,jx2);
771             dy22             = _mm_sub_ps(iy2,jy2);
772             dz22             = _mm_sub_ps(iz2,jz2);
773
774             /* Calculate squared distance and things based on it */
775             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
776             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
777             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
778             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
779             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
780             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
781             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
782             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
783             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
784
785             rinv00           = gmx_mm_invsqrt_ps(rsq00);
786             rinv01           = gmx_mm_invsqrt_ps(rsq01);
787             rinv02           = gmx_mm_invsqrt_ps(rsq02);
788             rinv10           = gmx_mm_invsqrt_ps(rsq10);
789             rinv11           = gmx_mm_invsqrt_ps(rsq11);
790             rinv12           = gmx_mm_invsqrt_ps(rsq12);
791             rinv20           = gmx_mm_invsqrt_ps(rsq20);
792             rinv21           = gmx_mm_invsqrt_ps(rsq21);
793             rinv22           = gmx_mm_invsqrt_ps(rsq22);
794
795             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
796             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
797             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
798             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
799             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
800             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
801             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
802             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
803             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
804
805             fjx0             = _mm_setzero_ps();
806             fjy0             = _mm_setzero_ps();
807             fjz0             = _mm_setzero_ps();
808             fjx1             = _mm_setzero_ps();
809             fjy1             = _mm_setzero_ps();
810             fjz1             = _mm_setzero_ps();
811             fjx2             = _mm_setzero_ps();
812             fjy2             = _mm_setzero_ps();
813             fjz2             = _mm_setzero_ps();
814
815             /**************************
816              * CALCULATE INTERACTIONS *
817              **************************/
818
819             r00              = _mm_mul_ps(rsq00,rinv00);
820             r00              = _mm_andnot_ps(dummy_mask,r00);
821
822             /* Calculate table index by multiplying r with table scale and truncate to integer */
823             rt               = _mm_mul_ps(r00,vftabscale);
824             vfitab           = _mm_cvttps_epi32(rt);
825             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
826             vfitab           = _mm_slli_epi32(vfitab,3);
827
828             /* EWALD ELECTROSTATICS */
829
830             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
831             ewrt             = _mm_mul_ps(r00,ewtabscale);
832             ewitab           = _mm_cvttps_epi32(ewrt);
833             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
834             ewitab           = _mm_slli_epi32(ewitab,2);
835             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
836             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
837             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
838             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
839             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
840             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
841             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
842             velec            = _mm_mul_ps(qq00,_mm_sub_ps(rinv00,velec));
843             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
844
845             /* CUBIC SPLINE TABLE DISPERSION */
846             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
847             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
848             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
849             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
850             _MM_TRANSPOSE4_PS(Y,F,G,H);
851             Heps             = _mm_mul_ps(vfeps,H);
852             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
853             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
854             vvdw6            = _mm_mul_ps(c6_00,VV);
855             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
856             fvdw6            = _mm_mul_ps(c6_00,FF);
857
858             /* CUBIC SPLINE TABLE REPULSION */
859             vfitab           = _mm_add_epi32(vfitab,ifour);
860             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
861             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
862             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
863             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
864             _MM_TRANSPOSE4_PS(Y,F,G,H);
865             Heps             = _mm_mul_ps(vfeps,H);
866             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
867             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
868             vvdw12           = _mm_mul_ps(c12_00,VV);
869             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
870             fvdw12           = _mm_mul_ps(c12_00,FF);
871             vvdw             = _mm_add_ps(vvdw12,vvdw6);
872             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
873
874             /* Update potential sum for this i atom from the interaction with this j atom. */
875             velec            = _mm_andnot_ps(dummy_mask,velec);
876             velecsum         = _mm_add_ps(velecsum,velec);
877             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
878             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
879
880             fscal            = _mm_add_ps(felec,fvdw);
881
882             fscal            = _mm_andnot_ps(dummy_mask,fscal);
883
884             /* Calculate temporary vectorial force */
885             tx               = _mm_mul_ps(fscal,dx00);
886             ty               = _mm_mul_ps(fscal,dy00);
887             tz               = _mm_mul_ps(fscal,dz00);
888
889             /* Update vectorial force */
890             fix0             = _mm_add_ps(fix0,tx);
891             fiy0             = _mm_add_ps(fiy0,ty);
892             fiz0             = _mm_add_ps(fiz0,tz);
893
894             fjx0             = _mm_add_ps(fjx0,tx);
895             fjy0             = _mm_add_ps(fjy0,ty);
896             fjz0             = _mm_add_ps(fjz0,tz);
897             
898             /**************************
899              * CALCULATE INTERACTIONS *
900              **************************/
901
902             r01              = _mm_mul_ps(rsq01,rinv01);
903             r01              = _mm_andnot_ps(dummy_mask,r01);
904
905             /* EWALD ELECTROSTATICS */
906
907             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
908             ewrt             = _mm_mul_ps(r01,ewtabscale);
909             ewitab           = _mm_cvttps_epi32(ewrt);
910             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
911             ewitab           = _mm_slli_epi32(ewitab,2);
912             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
913             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
914             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
915             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
916             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
917             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
918             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
919             velec            = _mm_mul_ps(qq01,_mm_sub_ps(rinv01,velec));
920             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
921
922             /* Update potential sum for this i atom from the interaction with this j atom. */
923             velec            = _mm_andnot_ps(dummy_mask,velec);
924             velecsum         = _mm_add_ps(velecsum,velec);
925
926             fscal            = felec;
927
928             fscal            = _mm_andnot_ps(dummy_mask,fscal);
929
930             /* Calculate temporary vectorial force */
931             tx               = _mm_mul_ps(fscal,dx01);
932             ty               = _mm_mul_ps(fscal,dy01);
933             tz               = _mm_mul_ps(fscal,dz01);
934
935             /* Update vectorial force */
936             fix0             = _mm_add_ps(fix0,tx);
937             fiy0             = _mm_add_ps(fiy0,ty);
938             fiz0             = _mm_add_ps(fiz0,tz);
939
940             fjx1             = _mm_add_ps(fjx1,tx);
941             fjy1             = _mm_add_ps(fjy1,ty);
942             fjz1             = _mm_add_ps(fjz1,tz);
943             
944             /**************************
945              * CALCULATE INTERACTIONS *
946              **************************/
947
948             r02              = _mm_mul_ps(rsq02,rinv02);
949             r02              = _mm_andnot_ps(dummy_mask,r02);
950
951             /* EWALD ELECTROSTATICS */
952
953             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
954             ewrt             = _mm_mul_ps(r02,ewtabscale);
955             ewitab           = _mm_cvttps_epi32(ewrt);
956             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
957             ewitab           = _mm_slli_epi32(ewitab,2);
958             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
959             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
960             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
961             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
962             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
963             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
964             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
965             velec            = _mm_mul_ps(qq02,_mm_sub_ps(rinv02,velec));
966             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
967
968             /* Update potential sum for this i atom from the interaction with this j atom. */
969             velec            = _mm_andnot_ps(dummy_mask,velec);
970             velecsum         = _mm_add_ps(velecsum,velec);
971
972             fscal            = felec;
973
974             fscal            = _mm_andnot_ps(dummy_mask,fscal);
975
976             /* Calculate temporary vectorial force */
977             tx               = _mm_mul_ps(fscal,dx02);
978             ty               = _mm_mul_ps(fscal,dy02);
979             tz               = _mm_mul_ps(fscal,dz02);
980
981             /* Update vectorial force */
982             fix0             = _mm_add_ps(fix0,tx);
983             fiy0             = _mm_add_ps(fiy0,ty);
984             fiz0             = _mm_add_ps(fiz0,tz);
985
986             fjx2             = _mm_add_ps(fjx2,tx);
987             fjy2             = _mm_add_ps(fjy2,ty);
988             fjz2             = _mm_add_ps(fjz2,tz);
989             
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             r10              = _mm_mul_ps(rsq10,rinv10);
995             r10              = _mm_andnot_ps(dummy_mask,r10);
996
997             /* EWALD ELECTROSTATICS */
998
999             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1000             ewrt             = _mm_mul_ps(r10,ewtabscale);
1001             ewitab           = _mm_cvttps_epi32(ewrt);
1002             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1003             ewitab           = _mm_slli_epi32(ewitab,2);
1004             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1005             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1006             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1007             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1008             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1009             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1010             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1011             velec            = _mm_mul_ps(qq10,_mm_sub_ps(rinv10,velec));
1012             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1013
1014             /* Update potential sum for this i atom from the interaction with this j atom. */
1015             velec            = _mm_andnot_ps(dummy_mask,velec);
1016             velecsum         = _mm_add_ps(velecsum,velec);
1017
1018             fscal            = felec;
1019
1020             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1021
1022             /* Calculate temporary vectorial force */
1023             tx               = _mm_mul_ps(fscal,dx10);
1024             ty               = _mm_mul_ps(fscal,dy10);
1025             tz               = _mm_mul_ps(fscal,dz10);
1026
1027             /* Update vectorial force */
1028             fix1             = _mm_add_ps(fix1,tx);
1029             fiy1             = _mm_add_ps(fiy1,ty);
1030             fiz1             = _mm_add_ps(fiz1,tz);
1031
1032             fjx0             = _mm_add_ps(fjx0,tx);
1033             fjy0             = _mm_add_ps(fjy0,ty);
1034             fjz0             = _mm_add_ps(fjz0,tz);
1035             
1036             /**************************
1037              * CALCULATE INTERACTIONS *
1038              **************************/
1039
1040             r11              = _mm_mul_ps(rsq11,rinv11);
1041             r11              = _mm_andnot_ps(dummy_mask,r11);
1042
1043             /* EWALD ELECTROSTATICS */
1044
1045             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1046             ewrt             = _mm_mul_ps(r11,ewtabscale);
1047             ewitab           = _mm_cvttps_epi32(ewrt);
1048             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1049             ewitab           = _mm_slli_epi32(ewitab,2);
1050             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1051             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1052             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1053             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1054             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1055             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1056             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1057             velec            = _mm_mul_ps(qq11,_mm_sub_ps(rinv11,velec));
1058             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1059
1060             /* Update potential sum for this i atom from the interaction with this j atom. */
1061             velec            = _mm_andnot_ps(dummy_mask,velec);
1062             velecsum         = _mm_add_ps(velecsum,velec);
1063
1064             fscal            = felec;
1065
1066             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1067
1068             /* Calculate temporary vectorial force */
1069             tx               = _mm_mul_ps(fscal,dx11);
1070             ty               = _mm_mul_ps(fscal,dy11);
1071             tz               = _mm_mul_ps(fscal,dz11);
1072
1073             /* Update vectorial force */
1074             fix1             = _mm_add_ps(fix1,tx);
1075             fiy1             = _mm_add_ps(fiy1,ty);
1076             fiz1             = _mm_add_ps(fiz1,tz);
1077
1078             fjx1             = _mm_add_ps(fjx1,tx);
1079             fjy1             = _mm_add_ps(fjy1,ty);
1080             fjz1             = _mm_add_ps(fjz1,tz);
1081             
1082             /**************************
1083              * CALCULATE INTERACTIONS *
1084              **************************/
1085
1086             r12              = _mm_mul_ps(rsq12,rinv12);
1087             r12              = _mm_andnot_ps(dummy_mask,r12);
1088
1089             /* EWALD ELECTROSTATICS */
1090
1091             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1092             ewrt             = _mm_mul_ps(r12,ewtabscale);
1093             ewitab           = _mm_cvttps_epi32(ewrt);
1094             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1095             ewitab           = _mm_slli_epi32(ewitab,2);
1096             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1097             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1098             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1099             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1100             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1101             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1102             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1103             velec            = _mm_mul_ps(qq12,_mm_sub_ps(rinv12,velec));
1104             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1105
1106             /* Update potential sum for this i atom from the interaction with this j atom. */
1107             velec            = _mm_andnot_ps(dummy_mask,velec);
1108             velecsum         = _mm_add_ps(velecsum,velec);
1109
1110             fscal            = felec;
1111
1112             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1113
1114             /* Calculate temporary vectorial force */
1115             tx               = _mm_mul_ps(fscal,dx12);
1116             ty               = _mm_mul_ps(fscal,dy12);
1117             tz               = _mm_mul_ps(fscal,dz12);
1118
1119             /* Update vectorial force */
1120             fix1             = _mm_add_ps(fix1,tx);
1121             fiy1             = _mm_add_ps(fiy1,ty);
1122             fiz1             = _mm_add_ps(fiz1,tz);
1123
1124             fjx2             = _mm_add_ps(fjx2,tx);
1125             fjy2             = _mm_add_ps(fjy2,ty);
1126             fjz2             = _mm_add_ps(fjz2,tz);
1127             
1128             /**************************
1129              * CALCULATE INTERACTIONS *
1130              **************************/
1131
1132             r20              = _mm_mul_ps(rsq20,rinv20);
1133             r20              = _mm_andnot_ps(dummy_mask,r20);
1134
1135             /* EWALD ELECTROSTATICS */
1136
1137             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1138             ewrt             = _mm_mul_ps(r20,ewtabscale);
1139             ewitab           = _mm_cvttps_epi32(ewrt);
1140             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1141             ewitab           = _mm_slli_epi32(ewitab,2);
1142             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1143             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1144             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1145             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1146             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1147             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1148             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1149             velec            = _mm_mul_ps(qq20,_mm_sub_ps(rinv20,velec));
1150             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1151
1152             /* Update potential sum for this i atom from the interaction with this j atom. */
1153             velec            = _mm_andnot_ps(dummy_mask,velec);
1154             velecsum         = _mm_add_ps(velecsum,velec);
1155
1156             fscal            = felec;
1157
1158             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1159
1160             /* Calculate temporary vectorial force */
1161             tx               = _mm_mul_ps(fscal,dx20);
1162             ty               = _mm_mul_ps(fscal,dy20);
1163             tz               = _mm_mul_ps(fscal,dz20);
1164
1165             /* Update vectorial force */
1166             fix2             = _mm_add_ps(fix2,tx);
1167             fiy2             = _mm_add_ps(fiy2,ty);
1168             fiz2             = _mm_add_ps(fiz2,tz);
1169
1170             fjx0             = _mm_add_ps(fjx0,tx);
1171             fjy0             = _mm_add_ps(fjy0,ty);
1172             fjz0             = _mm_add_ps(fjz0,tz);
1173             
1174             /**************************
1175              * CALCULATE INTERACTIONS *
1176              **************************/
1177
1178             r21              = _mm_mul_ps(rsq21,rinv21);
1179             r21              = _mm_andnot_ps(dummy_mask,r21);
1180
1181             /* EWALD ELECTROSTATICS */
1182
1183             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1184             ewrt             = _mm_mul_ps(r21,ewtabscale);
1185             ewitab           = _mm_cvttps_epi32(ewrt);
1186             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1187             ewitab           = _mm_slli_epi32(ewitab,2);
1188             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1189             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1190             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1191             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1192             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1193             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1194             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1195             velec            = _mm_mul_ps(qq21,_mm_sub_ps(rinv21,velec));
1196             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1197
1198             /* Update potential sum for this i atom from the interaction with this j atom. */
1199             velec            = _mm_andnot_ps(dummy_mask,velec);
1200             velecsum         = _mm_add_ps(velecsum,velec);
1201
1202             fscal            = felec;
1203
1204             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1205
1206             /* Calculate temporary vectorial force */
1207             tx               = _mm_mul_ps(fscal,dx21);
1208             ty               = _mm_mul_ps(fscal,dy21);
1209             tz               = _mm_mul_ps(fscal,dz21);
1210
1211             /* Update vectorial force */
1212             fix2             = _mm_add_ps(fix2,tx);
1213             fiy2             = _mm_add_ps(fiy2,ty);
1214             fiz2             = _mm_add_ps(fiz2,tz);
1215
1216             fjx1             = _mm_add_ps(fjx1,tx);
1217             fjy1             = _mm_add_ps(fjy1,ty);
1218             fjz1             = _mm_add_ps(fjz1,tz);
1219             
1220             /**************************
1221              * CALCULATE INTERACTIONS *
1222              **************************/
1223
1224             r22              = _mm_mul_ps(rsq22,rinv22);
1225             r22              = _mm_andnot_ps(dummy_mask,r22);
1226
1227             /* EWALD ELECTROSTATICS */
1228
1229             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1230             ewrt             = _mm_mul_ps(r22,ewtabscale);
1231             ewitab           = _mm_cvttps_epi32(ewrt);
1232             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1233             ewitab           = _mm_slli_epi32(ewitab,2);
1234             ewtabF           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,0) );
1235             ewtabD           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,1) );
1236             ewtabV           = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,2) );
1237             ewtabFn          = _mm_load_ps( ewtab + gmx_mm_extract_epi32(ewitab,3) );
1238             _MM_TRANSPOSE4_PS(ewtabF,ewtabD,ewtabV,ewtabFn);
1239             felec            = _mm_add_ps(ewtabF,_mm_mul_ps(eweps,ewtabD));
1240             velec            = _mm_sub_ps(ewtabV,_mm_mul_ps(_mm_mul_ps(ewtabhalfspace,eweps),_mm_add_ps(ewtabF,felec)));
1241             velec            = _mm_mul_ps(qq22,_mm_sub_ps(rinv22,velec));
1242             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1243
1244             /* Update potential sum for this i atom from the interaction with this j atom. */
1245             velec            = _mm_andnot_ps(dummy_mask,velec);
1246             velecsum         = _mm_add_ps(velecsum,velec);
1247
1248             fscal            = felec;
1249
1250             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1251
1252             /* Calculate temporary vectorial force */
1253             tx               = _mm_mul_ps(fscal,dx22);
1254             ty               = _mm_mul_ps(fscal,dy22);
1255             tz               = _mm_mul_ps(fscal,dz22);
1256
1257             /* Update vectorial force */
1258             fix2             = _mm_add_ps(fix2,tx);
1259             fiy2             = _mm_add_ps(fiy2,ty);
1260             fiz2             = _mm_add_ps(fiz2,tz);
1261
1262             fjx2             = _mm_add_ps(fjx2,tx);
1263             fjy2             = _mm_add_ps(fjy2,ty);
1264             fjz2             = _mm_add_ps(fjz2,tz);
1265             
1266             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1267             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1268             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1269             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1270
1271             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1272                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1273
1274             /* Inner loop uses 412 flops */
1275         }
1276
1277         /* End of innermost loop */
1278
1279         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1280                                               f+i_coord_offset,fshift+i_shift_offset);
1281
1282         ggid                        = gid[iidx];
1283         /* Update potential energies */
1284         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1285         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1286
1287         /* Increment number of inner iterations */
1288         inneriter                  += j_index_end - j_index_start;
1289
1290         /* Outer loop uses 20 flops */
1291     }
1292
1293     /* Increment number of outer iterations */
1294     outeriter        += nri;
1295
1296     /* Update outer/inner flops */
1297
1298     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*412);
1299 }
1300 /*
1301  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single
1302  * Electrostatics interaction: Ewald
1303  * VdW interaction:            CubicSplineTable
1304  * Geometry:                   Water3-Water3
1305  * Calculate force/pot:        Force
1306  */
1307 void
1308 nb_kernel_ElecEw_VdwCSTab_GeomW3W3_F_sse2_single
1309                     (t_nblist * gmx_restrict                nlist,
1310                      rvec * gmx_restrict                    xx,
1311                      rvec * gmx_restrict                    ff,
1312                      t_forcerec * gmx_restrict              fr,
1313                      t_mdatoms * gmx_restrict               mdatoms,
1314                      nb_kernel_data_t * gmx_restrict        kernel_data,
1315                      t_nrnb * gmx_restrict                  nrnb)
1316 {
1317     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1318      * just 0 for non-waters.
1319      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1320      * jnr indices corresponding to data put in the four positions in the SIMD register.
1321      */
1322     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1323     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1324     int              jnrA,jnrB,jnrC,jnrD;
1325     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1326     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1327     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1328     real             rcutoff_scalar;
1329     real             *shiftvec,*fshift,*x,*f;
1330     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1331     real             scratch[4*DIM];
1332     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1333     int              vdwioffset0;
1334     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1335     int              vdwioffset1;
1336     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1337     int              vdwioffset2;
1338     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1339     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1340     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1341     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1342     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1343     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1344     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1345     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1346     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1347     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1348     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1349     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1350     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1351     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1352     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1353     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1354     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1355     real             *charge;
1356     int              nvdwtype;
1357     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1358     int              *vdwtype;
1359     real             *vdwparam;
1360     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1361     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1362     __m128i          vfitab;
1363     __m128i          ifour       = _mm_set1_epi32(4);
1364     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1365     real             *vftab;
1366     __m128i          ewitab;
1367     __m128           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1368     real             *ewtab;
1369     __m128           dummy_mask,cutoff_mask;
1370     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1371     __m128           one     = _mm_set1_ps(1.0);
1372     __m128           two     = _mm_set1_ps(2.0);
1373     x                = xx[0];
1374     f                = ff[0];
1375
1376     nri              = nlist->nri;
1377     iinr             = nlist->iinr;
1378     jindex           = nlist->jindex;
1379     jjnr             = nlist->jjnr;
1380     shiftidx         = nlist->shift;
1381     gid              = nlist->gid;
1382     shiftvec         = fr->shift_vec[0];
1383     fshift           = fr->fshift[0];
1384     facel            = _mm_set1_ps(fr->epsfac);
1385     charge           = mdatoms->chargeA;
1386     nvdwtype         = fr->ntype;
1387     vdwparam         = fr->nbfp;
1388     vdwtype          = mdatoms->typeA;
1389
1390     vftab            = kernel_data->table_vdw->data;
1391     vftabscale       = _mm_set1_ps(kernel_data->table_vdw->scale);
1392
1393     sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
1394     ewtab            = fr->ic->tabq_coul_F;
1395     ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
1396     ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);
1397
1398     /* Setup water-specific parameters */
1399     inr              = nlist->iinr[0];
1400     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1401     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1402     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1403     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1404
1405     jq0              = _mm_set1_ps(charge[inr+0]);
1406     jq1              = _mm_set1_ps(charge[inr+1]);
1407     jq2              = _mm_set1_ps(charge[inr+2]);
1408     vdwjidx0A        = 2*vdwtype[inr+0];
1409     qq00             = _mm_mul_ps(iq0,jq0);
1410     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1411     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1412     qq01             = _mm_mul_ps(iq0,jq1);
1413     qq02             = _mm_mul_ps(iq0,jq2);
1414     qq10             = _mm_mul_ps(iq1,jq0);
1415     qq11             = _mm_mul_ps(iq1,jq1);
1416     qq12             = _mm_mul_ps(iq1,jq2);
1417     qq20             = _mm_mul_ps(iq2,jq0);
1418     qq21             = _mm_mul_ps(iq2,jq1);
1419     qq22             = _mm_mul_ps(iq2,jq2);
1420
1421     /* Avoid stupid compiler warnings */
1422     jnrA = jnrB = jnrC = jnrD = 0;
1423     j_coord_offsetA = 0;
1424     j_coord_offsetB = 0;
1425     j_coord_offsetC = 0;
1426     j_coord_offsetD = 0;
1427
1428     outeriter        = 0;
1429     inneriter        = 0;
1430
1431     for(iidx=0;iidx<4*DIM;iidx++)
1432     {
1433         scratch[iidx] = 0.0;
1434     }  
1435
1436     /* Start outer loop over neighborlists */
1437     for(iidx=0; iidx<nri; iidx++)
1438     {
1439         /* Load shift vector for this list */
1440         i_shift_offset   = DIM*shiftidx[iidx];
1441
1442         /* Load limits for loop over neighbors */
1443         j_index_start    = jindex[iidx];
1444         j_index_end      = jindex[iidx+1];
1445
1446         /* Get outer coordinate index */
1447         inr              = iinr[iidx];
1448         i_coord_offset   = DIM*inr;
1449
1450         /* Load i particle coords and add shift vector */
1451         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1452                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1453         
1454         fix0             = _mm_setzero_ps();
1455         fiy0             = _mm_setzero_ps();
1456         fiz0             = _mm_setzero_ps();
1457         fix1             = _mm_setzero_ps();
1458         fiy1             = _mm_setzero_ps();
1459         fiz1             = _mm_setzero_ps();
1460         fix2             = _mm_setzero_ps();
1461         fiy2             = _mm_setzero_ps();
1462         fiz2             = _mm_setzero_ps();
1463
1464         /* Start inner kernel loop */
1465         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1466         {
1467
1468             /* Get j neighbor index, and coordinate index */
1469             jnrA             = jjnr[jidx];
1470             jnrB             = jjnr[jidx+1];
1471             jnrC             = jjnr[jidx+2];
1472             jnrD             = jjnr[jidx+3];
1473             j_coord_offsetA  = DIM*jnrA;
1474             j_coord_offsetB  = DIM*jnrB;
1475             j_coord_offsetC  = DIM*jnrC;
1476             j_coord_offsetD  = DIM*jnrD;
1477
1478             /* load j atom coordinates */
1479             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1480                                               x+j_coord_offsetC,x+j_coord_offsetD,
1481                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1482
1483             /* Calculate displacement vector */
1484             dx00             = _mm_sub_ps(ix0,jx0);
1485             dy00             = _mm_sub_ps(iy0,jy0);
1486             dz00             = _mm_sub_ps(iz0,jz0);
1487             dx01             = _mm_sub_ps(ix0,jx1);
1488             dy01             = _mm_sub_ps(iy0,jy1);
1489             dz01             = _mm_sub_ps(iz0,jz1);
1490             dx02             = _mm_sub_ps(ix0,jx2);
1491             dy02             = _mm_sub_ps(iy0,jy2);
1492             dz02             = _mm_sub_ps(iz0,jz2);
1493             dx10             = _mm_sub_ps(ix1,jx0);
1494             dy10             = _mm_sub_ps(iy1,jy0);
1495             dz10             = _mm_sub_ps(iz1,jz0);
1496             dx11             = _mm_sub_ps(ix1,jx1);
1497             dy11             = _mm_sub_ps(iy1,jy1);
1498             dz11             = _mm_sub_ps(iz1,jz1);
1499             dx12             = _mm_sub_ps(ix1,jx2);
1500             dy12             = _mm_sub_ps(iy1,jy2);
1501             dz12             = _mm_sub_ps(iz1,jz2);
1502             dx20             = _mm_sub_ps(ix2,jx0);
1503             dy20             = _mm_sub_ps(iy2,jy0);
1504             dz20             = _mm_sub_ps(iz2,jz0);
1505             dx21             = _mm_sub_ps(ix2,jx1);
1506             dy21             = _mm_sub_ps(iy2,jy1);
1507             dz21             = _mm_sub_ps(iz2,jz1);
1508             dx22             = _mm_sub_ps(ix2,jx2);
1509             dy22             = _mm_sub_ps(iy2,jy2);
1510             dz22             = _mm_sub_ps(iz2,jz2);
1511
1512             /* Calculate squared distance and things based on it */
1513             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1514             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1515             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1516             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1517             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1518             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1519             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1520             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1521             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1522
1523             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1524             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1525             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1526             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1527             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1528             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1529             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1530             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1531             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1532
1533             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1534             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1535             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1536             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1537             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1538             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1539             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1540             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1541             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1542
1543             fjx0             = _mm_setzero_ps();
1544             fjy0             = _mm_setzero_ps();
1545             fjz0             = _mm_setzero_ps();
1546             fjx1             = _mm_setzero_ps();
1547             fjy1             = _mm_setzero_ps();
1548             fjz1             = _mm_setzero_ps();
1549             fjx2             = _mm_setzero_ps();
1550             fjy2             = _mm_setzero_ps();
1551             fjz2             = _mm_setzero_ps();
1552
1553             /**************************
1554              * CALCULATE INTERACTIONS *
1555              **************************/
1556
1557             r00              = _mm_mul_ps(rsq00,rinv00);
1558
1559             /* Calculate table index by multiplying r with table scale and truncate to integer */
1560             rt               = _mm_mul_ps(r00,vftabscale);
1561             vfitab           = _mm_cvttps_epi32(rt);
1562             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1563             vfitab           = _mm_slli_epi32(vfitab,3);
1564
1565             /* EWALD ELECTROSTATICS */
1566
1567             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1568             ewrt             = _mm_mul_ps(r00,ewtabscale);
1569             ewitab           = _mm_cvttps_epi32(ewrt);
1570             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1571             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1572                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1573                                          &ewtabF,&ewtabFn);
1574             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1575             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
1576
1577             /* CUBIC SPLINE TABLE DISPERSION */
1578             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1579             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1580             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1581             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1582             _MM_TRANSPOSE4_PS(Y,F,G,H);
1583             Heps             = _mm_mul_ps(vfeps,H);
1584             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1585             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1586             fvdw6            = _mm_mul_ps(c6_00,FF);
1587
1588             /* CUBIC SPLINE TABLE REPULSION */
1589             vfitab           = _mm_add_epi32(vfitab,ifour);
1590             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1591             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1592             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1593             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1594             _MM_TRANSPOSE4_PS(Y,F,G,H);
1595             Heps             = _mm_mul_ps(vfeps,H);
1596             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1597             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1598             fvdw12           = _mm_mul_ps(c12_00,FF);
1599             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1600
1601             fscal            = _mm_add_ps(felec,fvdw);
1602
1603             /* Calculate temporary vectorial force */
1604             tx               = _mm_mul_ps(fscal,dx00);
1605             ty               = _mm_mul_ps(fscal,dy00);
1606             tz               = _mm_mul_ps(fscal,dz00);
1607
1608             /* Update vectorial force */
1609             fix0             = _mm_add_ps(fix0,tx);
1610             fiy0             = _mm_add_ps(fiy0,ty);
1611             fiz0             = _mm_add_ps(fiz0,tz);
1612
1613             fjx0             = _mm_add_ps(fjx0,tx);
1614             fjy0             = _mm_add_ps(fjy0,ty);
1615             fjz0             = _mm_add_ps(fjz0,tz);
1616             
1617             /**************************
1618              * CALCULATE INTERACTIONS *
1619              **************************/
1620
1621             r01              = _mm_mul_ps(rsq01,rinv01);
1622
1623             /* EWALD ELECTROSTATICS */
1624
1625             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1626             ewrt             = _mm_mul_ps(r01,ewtabscale);
1627             ewitab           = _mm_cvttps_epi32(ewrt);
1628             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1629             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1630                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1631                                          &ewtabF,&ewtabFn);
1632             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1633             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
1634
1635             fscal            = felec;
1636
1637             /* Calculate temporary vectorial force */
1638             tx               = _mm_mul_ps(fscal,dx01);
1639             ty               = _mm_mul_ps(fscal,dy01);
1640             tz               = _mm_mul_ps(fscal,dz01);
1641
1642             /* Update vectorial force */
1643             fix0             = _mm_add_ps(fix0,tx);
1644             fiy0             = _mm_add_ps(fiy0,ty);
1645             fiz0             = _mm_add_ps(fiz0,tz);
1646
1647             fjx1             = _mm_add_ps(fjx1,tx);
1648             fjy1             = _mm_add_ps(fjy1,ty);
1649             fjz1             = _mm_add_ps(fjz1,tz);
1650             
1651             /**************************
1652              * CALCULATE INTERACTIONS *
1653              **************************/
1654
1655             r02              = _mm_mul_ps(rsq02,rinv02);
1656
1657             /* EWALD ELECTROSTATICS */
1658
1659             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1660             ewrt             = _mm_mul_ps(r02,ewtabscale);
1661             ewitab           = _mm_cvttps_epi32(ewrt);
1662             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1663             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1664                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1665                                          &ewtabF,&ewtabFn);
1666             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1667             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
1668
1669             fscal            = felec;
1670
1671             /* Calculate temporary vectorial force */
1672             tx               = _mm_mul_ps(fscal,dx02);
1673             ty               = _mm_mul_ps(fscal,dy02);
1674             tz               = _mm_mul_ps(fscal,dz02);
1675
1676             /* Update vectorial force */
1677             fix0             = _mm_add_ps(fix0,tx);
1678             fiy0             = _mm_add_ps(fiy0,ty);
1679             fiz0             = _mm_add_ps(fiz0,tz);
1680
1681             fjx2             = _mm_add_ps(fjx2,tx);
1682             fjy2             = _mm_add_ps(fjy2,ty);
1683             fjz2             = _mm_add_ps(fjz2,tz);
1684             
1685             /**************************
1686              * CALCULATE INTERACTIONS *
1687              **************************/
1688
1689             r10              = _mm_mul_ps(rsq10,rinv10);
1690
1691             /* EWALD ELECTROSTATICS */
1692
1693             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1694             ewrt             = _mm_mul_ps(r10,ewtabscale);
1695             ewitab           = _mm_cvttps_epi32(ewrt);
1696             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1697             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1698                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1699                                          &ewtabF,&ewtabFn);
1700             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1701             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
1702
1703             fscal            = felec;
1704
1705             /* Calculate temporary vectorial force */
1706             tx               = _mm_mul_ps(fscal,dx10);
1707             ty               = _mm_mul_ps(fscal,dy10);
1708             tz               = _mm_mul_ps(fscal,dz10);
1709
1710             /* Update vectorial force */
1711             fix1             = _mm_add_ps(fix1,tx);
1712             fiy1             = _mm_add_ps(fiy1,ty);
1713             fiz1             = _mm_add_ps(fiz1,tz);
1714
1715             fjx0             = _mm_add_ps(fjx0,tx);
1716             fjy0             = _mm_add_ps(fjy0,ty);
1717             fjz0             = _mm_add_ps(fjz0,tz);
1718             
1719             /**************************
1720              * CALCULATE INTERACTIONS *
1721              **************************/
1722
1723             r11              = _mm_mul_ps(rsq11,rinv11);
1724
1725             /* EWALD ELECTROSTATICS */
1726
1727             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1728             ewrt             = _mm_mul_ps(r11,ewtabscale);
1729             ewitab           = _mm_cvttps_epi32(ewrt);
1730             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1731             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1732                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1733                                          &ewtabF,&ewtabFn);
1734             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1735             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
1736
1737             fscal            = felec;
1738
1739             /* Calculate temporary vectorial force */
1740             tx               = _mm_mul_ps(fscal,dx11);
1741             ty               = _mm_mul_ps(fscal,dy11);
1742             tz               = _mm_mul_ps(fscal,dz11);
1743
1744             /* Update vectorial force */
1745             fix1             = _mm_add_ps(fix1,tx);
1746             fiy1             = _mm_add_ps(fiy1,ty);
1747             fiz1             = _mm_add_ps(fiz1,tz);
1748
1749             fjx1             = _mm_add_ps(fjx1,tx);
1750             fjy1             = _mm_add_ps(fjy1,ty);
1751             fjz1             = _mm_add_ps(fjz1,tz);
1752             
1753             /**************************
1754              * CALCULATE INTERACTIONS *
1755              **************************/
1756
1757             r12              = _mm_mul_ps(rsq12,rinv12);
1758
1759             /* EWALD ELECTROSTATICS */
1760
1761             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1762             ewrt             = _mm_mul_ps(r12,ewtabscale);
1763             ewitab           = _mm_cvttps_epi32(ewrt);
1764             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1765             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1766                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1767                                          &ewtabF,&ewtabFn);
1768             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1769             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
1770
1771             fscal            = felec;
1772
1773             /* Calculate temporary vectorial force */
1774             tx               = _mm_mul_ps(fscal,dx12);
1775             ty               = _mm_mul_ps(fscal,dy12);
1776             tz               = _mm_mul_ps(fscal,dz12);
1777
1778             /* Update vectorial force */
1779             fix1             = _mm_add_ps(fix1,tx);
1780             fiy1             = _mm_add_ps(fiy1,ty);
1781             fiz1             = _mm_add_ps(fiz1,tz);
1782
1783             fjx2             = _mm_add_ps(fjx2,tx);
1784             fjy2             = _mm_add_ps(fjy2,ty);
1785             fjz2             = _mm_add_ps(fjz2,tz);
1786             
1787             /**************************
1788              * CALCULATE INTERACTIONS *
1789              **************************/
1790
1791             r20              = _mm_mul_ps(rsq20,rinv20);
1792
1793             /* EWALD ELECTROSTATICS */
1794
1795             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1796             ewrt             = _mm_mul_ps(r20,ewtabscale);
1797             ewitab           = _mm_cvttps_epi32(ewrt);
1798             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1799             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1800                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1801                                          &ewtabF,&ewtabFn);
1802             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1803             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
1804
1805             fscal            = felec;
1806
1807             /* Calculate temporary vectorial force */
1808             tx               = _mm_mul_ps(fscal,dx20);
1809             ty               = _mm_mul_ps(fscal,dy20);
1810             tz               = _mm_mul_ps(fscal,dz20);
1811
1812             /* Update vectorial force */
1813             fix2             = _mm_add_ps(fix2,tx);
1814             fiy2             = _mm_add_ps(fiy2,ty);
1815             fiz2             = _mm_add_ps(fiz2,tz);
1816
1817             fjx0             = _mm_add_ps(fjx0,tx);
1818             fjy0             = _mm_add_ps(fjy0,ty);
1819             fjz0             = _mm_add_ps(fjz0,tz);
1820             
1821             /**************************
1822              * CALCULATE INTERACTIONS *
1823              **************************/
1824
1825             r21              = _mm_mul_ps(rsq21,rinv21);
1826
1827             /* EWALD ELECTROSTATICS */
1828
1829             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1830             ewrt             = _mm_mul_ps(r21,ewtabscale);
1831             ewitab           = _mm_cvttps_epi32(ewrt);
1832             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1833             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1834                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1835                                          &ewtabF,&ewtabFn);
1836             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1837             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
1838
1839             fscal            = felec;
1840
1841             /* Calculate temporary vectorial force */
1842             tx               = _mm_mul_ps(fscal,dx21);
1843             ty               = _mm_mul_ps(fscal,dy21);
1844             tz               = _mm_mul_ps(fscal,dz21);
1845
1846             /* Update vectorial force */
1847             fix2             = _mm_add_ps(fix2,tx);
1848             fiy2             = _mm_add_ps(fiy2,ty);
1849             fiz2             = _mm_add_ps(fiz2,tz);
1850
1851             fjx1             = _mm_add_ps(fjx1,tx);
1852             fjy1             = _mm_add_ps(fjy1,ty);
1853             fjz1             = _mm_add_ps(fjz1,tz);
1854             
1855             /**************************
1856              * CALCULATE INTERACTIONS *
1857              **************************/
1858
1859             r22              = _mm_mul_ps(rsq22,rinv22);
1860
1861             /* EWALD ELECTROSTATICS */
1862
1863             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1864             ewrt             = _mm_mul_ps(r22,ewtabscale);
1865             ewitab           = _mm_cvttps_epi32(ewrt);
1866             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
1867             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
1868                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
1869                                          &ewtabF,&ewtabFn);
1870             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
1871             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
1872
1873             fscal            = felec;
1874
1875             /* Calculate temporary vectorial force */
1876             tx               = _mm_mul_ps(fscal,dx22);
1877             ty               = _mm_mul_ps(fscal,dy22);
1878             tz               = _mm_mul_ps(fscal,dz22);
1879
1880             /* Update vectorial force */
1881             fix2             = _mm_add_ps(fix2,tx);
1882             fiy2             = _mm_add_ps(fiy2,ty);
1883             fiz2             = _mm_add_ps(fiz2,tz);
1884
1885             fjx2             = _mm_add_ps(fjx2,tx);
1886             fjy2             = _mm_add_ps(fjy2,ty);
1887             fjz2             = _mm_add_ps(fjz2,tz);
1888             
1889             fjptrA             = f+j_coord_offsetA;
1890             fjptrB             = f+j_coord_offsetB;
1891             fjptrC             = f+j_coord_offsetC;
1892             fjptrD             = f+j_coord_offsetD;
1893
1894             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1895                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1896
1897             /* Inner loop uses 350 flops */
1898         }
1899
1900         if(jidx<j_index_end)
1901         {
1902
1903             /* Get j neighbor index, and coordinate index */
1904             jnrlistA         = jjnr[jidx];
1905             jnrlistB         = jjnr[jidx+1];
1906             jnrlistC         = jjnr[jidx+2];
1907             jnrlistD         = jjnr[jidx+3];
1908             /* Sign of each element will be negative for non-real atoms.
1909              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1910              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1911              */
1912             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1913             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1914             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1915             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1916             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1917             j_coord_offsetA  = DIM*jnrA;
1918             j_coord_offsetB  = DIM*jnrB;
1919             j_coord_offsetC  = DIM*jnrC;
1920             j_coord_offsetD  = DIM*jnrD;
1921
1922             /* load j atom coordinates */
1923             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1924                                               x+j_coord_offsetC,x+j_coord_offsetD,
1925                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1926
1927             /* Calculate displacement vector */
1928             dx00             = _mm_sub_ps(ix0,jx0);
1929             dy00             = _mm_sub_ps(iy0,jy0);
1930             dz00             = _mm_sub_ps(iz0,jz0);
1931             dx01             = _mm_sub_ps(ix0,jx1);
1932             dy01             = _mm_sub_ps(iy0,jy1);
1933             dz01             = _mm_sub_ps(iz0,jz1);
1934             dx02             = _mm_sub_ps(ix0,jx2);
1935             dy02             = _mm_sub_ps(iy0,jy2);
1936             dz02             = _mm_sub_ps(iz0,jz2);
1937             dx10             = _mm_sub_ps(ix1,jx0);
1938             dy10             = _mm_sub_ps(iy1,jy0);
1939             dz10             = _mm_sub_ps(iz1,jz0);
1940             dx11             = _mm_sub_ps(ix1,jx1);
1941             dy11             = _mm_sub_ps(iy1,jy1);
1942             dz11             = _mm_sub_ps(iz1,jz1);
1943             dx12             = _mm_sub_ps(ix1,jx2);
1944             dy12             = _mm_sub_ps(iy1,jy2);
1945             dz12             = _mm_sub_ps(iz1,jz2);
1946             dx20             = _mm_sub_ps(ix2,jx0);
1947             dy20             = _mm_sub_ps(iy2,jy0);
1948             dz20             = _mm_sub_ps(iz2,jz0);
1949             dx21             = _mm_sub_ps(ix2,jx1);
1950             dy21             = _mm_sub_ps(iy2,jy1);
1951             dz21             = _mm_sub_ps(iz2,jz1);
1952             dx22             = _mm_sub_ps(ix2,jx2);
1953             dy22             = _mm_sub_ps(iy2,jy2);
1954             dz22             = _mm_sub_ps(iz2,jz2);
1955
1956             /* Calculate squared distance and things based on it */
1957             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1958             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1959             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1960             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1961             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1962             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1963             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1964             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1965             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1966
1967             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1968             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1969             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1970             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1971             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1972             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1973             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1974             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1975             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1976
1977             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1978             rinvsq01         = _mm_mul_ps(rinv01,rinv01);
1979             rinvsq02         = _mm_mul_ps(rinv02,rinv02);
1980             rinvsq10         = _mm_mul_ps(rinv10,rinv10);
1981             rinvsq11         = _mm_mul_ps(rinv11,rinv11);
1982             rinvsq12         = _mm_mul_ps(rinv12,rinv12);
1983             rinvsq20         = _mm_mul_ps(rinv20,rinv20);
1984             rinvsq21         = _mm_mul_ps(rinv21,rinv21);
1985             rinvsq22         = _mm_mul_ps(rinv22,rinv22);
1986
1987             fjx0             = _mm_setzero_ps();
1988             fjy0             = _mm_setzero_ps();
1989             fjz0             = _mm_setzero_ps();
1990             fjx1             = _mm_setzero_ps();
1991             fjy1             = _mm_setzero_ps();
1992             fjz1             = _mm_setzero_ps();
1993             fjx2             = _mm_setzero_ps();
1994             fjy2             = _mm_setzero_ps();
1995             fjz2             = _mm_setzero_ps();
1996
1997             /**************************
1998              * CALCULATE INTERACTIONS *
1999              **************************/
2000
2001             r00              = _mm_mul_ps(rsq00,rinv00);
2002             r00              = _mm_andnot_ps(dummy_mask,r00);
2003
2004             /* Calculate table index by multiplying r with table scale and truncate to integer */
2005             rt               = _mm_mul_ps(r00,vftabscale);
2006             vfitab           = _mm_cvttps_epi32(rt);
2007             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2008             vfitab           = _mm_slli_epi32(vfitab,3);
2009
2010             /* EWALD ELECTROSTATICS */
2011
2012             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2013             ewrt             = _mm_mul_ps(r00,ewtabscale);
2014             ewitab           = _mm_cvttps_epi32(ewrt);
2015             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2016             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2017                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2018                                          &ewtabF,&ewtabFn);
2019             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2020             felec            = _mm_mul_ps(_mm_mul_ps(qq00,rinv00),_mm_sub_ps(rinvsq00,felec));
2021
2022             /* CUBIC SPLINE TABLE DISPERSION */
2023             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2024             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2025             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2026             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2027             _MM_TRANSPOSE4_PS(Y,F,G,H);
2028             Heps             = _mm_mul_ps(vfeps,H);
2029             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2030             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2031             fvdw6            = _mm_mul_ps(c6_00,FF);
2032
2033             /* CUBIC SPLINE TABLE REPULSION */
2034             vfitab           = _mm_add_epi32(vfitab,ifour);
2035             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2036             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2037             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2038             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2039             _MM_TRANSPOSE4_PS(Y,F,G,H);
2040             Heps             = _mm_mul_ps(vfeps,H);
2041             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2042             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2043             fvdw12           = _mm_mul_ps(c12_00,FF);
2044             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2045
2046             fscal            = _mm_add_ps(felec,fvdw);
2047
2048             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2049
2050             /* Calculate temporary vectorial force */
2051             tx               = _mm_mul_ps(fscal,dx00);
2052             ty               = _mm_mul_ps(fscal,dy00);
2053             tz               = _mm_mul_ps(fscal,dz00);
2054
2055             /* Update vectorial force */
2056             fix0             = _mm_add_ps(fix0,tx);
2057             fiy0             = _mm_add_ps(fiy0,ty);
2058             fiz0             = _mm_add_ps(fiz0,tz);
2059
2060             fjx0             = _mm_add_ps(fjx0,tx);
2061             fjy0             = _mm_add_ps(fjy0,ty);
2062             fjz0             = _mm_add_ps(fjz0,tz);
2063             
2064             /**************************
2065              * CALCULATE INTERACTIONS *
2066              **************************/
2067
2068             r01              = _mm_mul_ps(rsq01,rinv01);
2069             r01              = _mm_andnot_ps(dummy_mask,r01);
2070
2071             /* EWALD ELECTROSTATICS */
2072
2073             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2074             ewrt             = _mm_mul_ps(r01,ewtabscale);
2075             ewitab           = _mm_cvttps_epi32(ewrt);
2076             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2077             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2078                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2079                                          &ewtabF,&ewtabFn);
2080             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2081             felec            = _mm_mul_ps(_mm_mul_ps(qq01,rinv01),_mm_sub_ps(rinvsq01,felec));
2082
2083             fscal            = felec;
2084
2085             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2086
2087             /* Calculate temporary vectorial force */
2088             tx               = _mm_mul_ps(fscal,dx01);
2089             ty               = _mm_mul_ps(fscal,dy01);
2090             tz               = _mm_mul_ps(fscal,dz01);
2091
2092             /* Update vectorial force */
2093             fix0             = _mm_add_ps(fix0,tx);
2094             fiy0             = _mm_add_ps(fiy0,ty);
2095             fiz0             = _mm_add_ps(fiz0,tz);
2096
2097             fjx1             = _mm_add_ps(fjx1,tx);
2098             fjy1             = _mm_add_ps(fjy1,ty);
2099             fjz1             = _mm_add_ps(fjz1,tz);
2100             
2101             /**************************
2102              * CALCULATE INTERACTIONS *
2103              **************************/
2104
2105             r02              = _mm_mul_ps(rsq02,rinv02);
2106             r02              = _mm_andnot_ps(dummy_mask,r02);
2107
2108             /* EWALD ELECTROSTATICS */
2109
2110             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2111             ewrt             = _mm_mul_ps(r02,ewtabscale);
2112             ewitab           = _mm_cvttps_epi32(ewrt);
2113             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2114             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2115                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2116                                          &ewtabF,&ewtabFn);
2117             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2118             felec            = _mm_mul_ps(_mm_mul_ps(qq02,rinv02),_mm_sub_ps(rinvsq02,felec));
2119
2120             fscal            = felec;
2121
2122             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2123
2124             /* Calculate temporary vectorial force */
2125             tx               = _mm_mul_ps(fscal,dx02);
2126             ty               = _mm_mul_ps(fscal,dy02);
2127             tz               = _mm_mul_ps(fscal,dz02);
2128
2129             /* Update vectorial force */
2130             fix0             = _mm_add_ps(fix0,tx);
2131             fiy0             = _mm_add_ps(fiy0,ty);
2132             fiz0             = _mm_add_ps(fiz0,tz);
2133
2134             fjx2             = _mm_add_ps(fjx2,tx);
2135             fjy2             = _mm_add_ps(fjy2,ty);
2136             fjz2             = _mm_add_ps(fjz2,tz);
2137             
2138             /**************************
2139              * CALCULATE INTERACTIONS *
2140              **************************/
2141
2142             r10              = _mm_mul_ps(rsq10,rinv10);
2143             r10              = _mm_andnot_ps(dummy_mask,r10);
2144
2145             /* EWALD ELECTROSTATICS */
2146
2147             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2148             ewrt             = _mm_mul_ps(r10,ewtabscale);
2149             ewitab           = _mm_cvttps_epi32(ewrt);
2150             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2151             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2152                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2153                                          &ewtabF,&ewtabFn);
2154             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2155             felec            = _mm_mul_ps(_mm_mul_ps(qq10,rinv10),_mm_sub_ps(rinvsq10,felec));
2156
2157             fscal            = felec;
2158
2159             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2160
2161             /* Calculate temporary vectorial force */
2162             tx               = _mm_mul_ps(fscal,dx10);
2163             ty               = _mm_mul_ps(fscal,dy10);
2164             tz               = _mm_mul_ps(fscal,dz10);
2165
2166             /* Update vectorial force */
2167             fix1             = _mm_add_ps(fix1,tx);
2168             fiy1             = _mm_add_ps(fiy1,ty);
2169             fiz1             = _mm_add_ps(fiz1,tz);
2170
2171             fjx0             = _mm_add_ps(fjx0,tx);
2172             fjy0             = _mm_add_ps(fjy0,ty);
2173             fjz0             = _mm_add_ps(fjz0,tz);
2174             
2175             /**************************
2176              * CALCULATE INTERACTIONS *
2177              **************************/
2178
2179             r11              = _mm_mul_ps(rsq11,rinv11);
2180             r11              = _mm_andnot_ps(dummy_mask,r11);
2181
2182             /* EWALD ELECTROSTATICS */
2183
2184             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2185             ewrt             = _mm_mul_ps(r11,ewtabscale);
2186             ewitab           = _mm_cvttps_epi32(ewrt);
2187             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2188             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2189                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2190                                          &ewtabF,&ewtabFn);
2191             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2192             felec            = _mm_mul_ps(_mm_mul_ps(qq11,rinv11),_mm_sub_ps(rinvsq11,felec));
2193
2194             fscal            = felec;
2195
2196             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2197
2198             /* Calculate temporary vectorial force */
2199             tx               = _mm_mul_ps(fscal,dx11);
2200             ty               = _mm_mul_ps(fscal,dy11);
2201             tz               = _mm_mul_ps(fscal,dz11);
2202
2203             /* Update vectorial force */
2204             fix1             = _mm_add_ps(fix1,tx);
2205             fiy1             = _mm_add_ps(fiy1,ty);
2206             fiz1             = _mm_add_ps(fiz1,tz);
2207
2208             fjx1             = _mm_add_ps(fjx1,tx);
2209             fjy1             = _mm_add_ps(fjy1,ty);
2210             fjz1             = _mm_add_ps(fjz1,tz);
2211             
2212             /**************************
2213              * CALCULATE INTERACTIONS *
2214              **************************/
2215
2216             r12              = _mm_mul_ps(rsq12,rinv12);
2217             r12              = _mm_andnot_ps(dummy_mask,r12);
2218
2219             /* EWALD ELECTROSTATICS */
2220
2221             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2222             ewrt             = _mm_mul_ps(r12,ewtabscale);
2223             ewitab           = _mm_cvttps_epi32(ewrt);
2224             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2225             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2226                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2227                                          &ewtabF,&ewtabFn);
2228             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2229             felec            = _mm_mul_ps(_mm_mul_ps(qq12,rinv12),_mm_sub_ps(rinvsq12,felec));
2230
2231             fscal            = felec;
2232
2233             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2234
2235             /* Calculate temporary vectorial force */
2236             tx               = _mm_mul_ps(fscal,dx12);
2237             ty               = _mm_mul_ps(fscal,dy12);
2238             tz               = _mm_mul_ps(fscal,dz12);
2239
2240             /* Update vectorial force */
2241             fix1             = _mm_add_ps(fix1,tx);
2242             fiy1             = _mm_add_ps(fiy1,ty);
2243             fiz1             = _mm_add_ps(fiz1,tz);
2244
2245             fjx2             = _mm_add_ps(fjx2,tx);
2246             fjy2             = _mm_add_ps(fjy2,ty);
2247             fjz2             = _mm_add_ps(fjz2,tz);
2248             
2249             /**************************
2250              * CALCULATE INTERACTIONS *
2251              **************************/
2252
2253             r20              = _mm_mul_ps(rsq20,rinv20);
2254             r20              = _mm_andnot_ps(dummy_mask,r20);
2255
2256             /* EWALD ELECTROSTATICS */
2257
2258             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2259             ewrt             = _mm_mul_ps(r20,ewtabscale);
2260             ewitab           = _mm_cvttps_epi32(ewrt);
2261             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2262             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2263                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2264                                          &ewtabF,&ewtabFn);
2265             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2266             felec            = _mm_mul_ps(_mm_mul_ps(qq20,rinv20),_mm_sub_ps(rinvsq20,felec));
2267
2268             fscal            = felec;
2269
2270             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2271
2272             /* Calculate temporary vectorial force */
2273             tx               = _mm_mul_ps(fscal,dx20);
2274             ty               = _mm_mul_ps(fscal,dy20);
2275             tz               = _mm_mul_ps(fscal,dz20);
2276
2277             /* Update vectorial force */
2278             fix2             = _mm_add_ps(fix2,tx);
2279             fiy2             = _mm_add_ps(fiy2,ty);
2280             fiz2             = _mm_add_ps(fiz2,tz);
2281
2282             fjx0             = _mm_add_ps(fjx0,tx);
2283             fjy0             = _mm_add_ps(fjy0,ty);
2284             fjz0             = _mm_add_ps(fjz0,tz);
2285             
2286             /**************************
2287              * CALCULATE INTERACTIONS *
2288              **************************/
2289
2290             r21              = _mm_mul_ps(rsq21,rinv21);
2291             r21              = _mm_andnot_ps(dummy_mask,r21);
2292
2293             /* EWALD ELECTROSTATICS */
2294
2295             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2296             ewrt             = _mm_mul_ps(r21,ewtabscale);
2297             ewitab           = _mm_cvttps_epi32(ewrt);
2298             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2299             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2300                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2301                                          &ewtabF,&ewtabFn);
2302             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2303             felec            = _mm_mul_ps(_mm_mul_ps(qq21,rinv21),_mm_sub_ps(rinvsq21,felec));
2304
2305             fscal            = felec;
2306
2307             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2308
2309             /* Calculate temporary vectorial force */
2310             tx               = _mm_mul_ps(fscal,dx21);
2311             ty               = _mm_mul_ps(fscal,dy21);
2312             tz               = _mm_mul_ps(fscal,dz21);
2313
2314             /* Update vectorial force */
2315             fix2             = _mm_add_ps(fix2,tx);
2316             fiy2             = _mm_add_ps(fiy2,ty);
2317             fiz2             = _mm_add_ps(fiz2,tz);
2318
2319             fjx1             = _mm_add_ps(fjx1,tx);
2320             fjy1             = _mm_add_ps(fjy1,ty);
2321             fjz1             = _mm_add_ps(fjz1,tz);
2322             
2323             /**************************
2324              * CALCULATE INTERACTIONS *
2325              **************************/
2326
2327             r22              = _mm_mul_ps(rsq22,rinv22);
2328             r22              = _mm_andnot_ps(dummy_mask,r22);
2329
2330             /* EWALD ELECTROSTATICS */
2331
2332             /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
2333             ewrt             = _mm_mul_ps(r22,ewtabscale);
2334             ewitab           = _mm_cvttps_epi32(ewrt);
2335             eweps            = _mm_sub_ps(ewrt,_mm_cvtepi32_ps(ewitab));
2336             gmx_mm_load_4pair_swizzle_ps(ewtab+gmx_mm_extract_epi32(ewitab,0),ewtab+gmx_mm_extract_epi32(ewitab,1),
2337                                          ewtab+gmx_mm_extract_epi32(ewitab,2),ewtab+gmx_mm_extract_epi32(ewitab,3),
2338                                          &ewtabF,&ewtabFn);
2339             felec            = _mm_add_ps(_mm_mul_ps( _mm_sub_ps(one,eweps),ewtabF),_mm_mul_ps(eweps,ewtabFn));
2340             felec            = _mm_mul_ps(_mm_mul_ps(qq22,rinv22),_mm_sub_ps(rinvsq22,felec));
2341
2342             fscal            = felec;
2343
2344             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2345
2346             /* Calculate temporary vectorial force */
2347             tx               = _mm_mul_ps(fscal,dx22);
2348             ty               = _mm_mul_ps(fscal,dy22);
2349             tz               = _mm_mul_ps(fscal,dz22);
2350
2351             /* Update vectorial force */
2352             fix2             = _mm_add_ps(fix2,tx);
2353             fiy2             = _mm_add_ps(fiy2,ty);
2354             fiz2             = _mm_add_ps(fiz2,tz);
2355
2356             fjx2             = _mm_add_ps(fjx2,tx);
2357             fjy2             = _mm_add_ps(fjy2,ty);
2358             fjz2             = _mm_add_ps(fjz2,tz);
2359             
2360             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2361             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2362             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2363             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2364
2365             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2366                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2367
2368             /* Inner loop uses 359 flops */
2369         }
2370
2371         /* End of innermost loop */
2372
2373         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2374                                               f+i_coord_offset,fshift+i_shift_offset);
2375
2376         /* Increment number of inner iterations */
2377         inneriter                  += j_index_end - j_index_start;
2378
2379         /* Outer loop uses 18 flops */
2380     }
2381
2382     /* Increment number of outer iterations */
2383     outeriter        += nri;
2384
2385     /* Update outer/inner flops */
2386
2387     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*359);
2388 }