Added option to gmx nmeig to print ZPE.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_sse2_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single
51  * Electrostatics interaction: CubicSplineTable
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_sse2_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
76     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
77     real             rcutoff_scalar;
78     real             *shiftvec,*fshift,*x,*f;
79     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
80     real             scratch[4*DIM];
81     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
82     int              vdwioffset0;
83     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
84     int              vdwioffset1;
85     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
86     int              vdwioffset2;
87     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
89     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
91     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
92     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
93     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
94     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
95     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
96     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
97     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
104     real             *charge;
105     int              nvdwtype;
106     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
107     int              *vdwtype;
108     real             *vdwparam;
109     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
110     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
111     __m128i          vfitab;
112     __m128i          ifour       = _mm_set1_epi32(4);
113     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
114     real             *vftab;
115     __m128           dummy_mask,cutoff_mask;
116     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
117     __m128           one     = _mm_set1_ps(1.0);
118     __m128           two     = _mm_set1_ps(2.0);
119     x                = xx[0];
120     f                = ff[0];
121
122     nri              = nlist->nri;
123     iinr             = nlist->iinr;
124     jindex           = nlist->jindex;
125     jjnr             = nlist->jjnr;
126     shiftidx         = nlist->shift;
127     gid              = nlist->gid;
128     shiftvec         = fr->shift_vec[0];
129     fshift           = fr->fshift[0];
130     facel            = _mm_set1_ps(fr->ic->epsfac);
131     charge           = mdatoms->chargeA;
132     nvdwtype         = fr->ntype;
133     vdwparam         = fr->nbfp;
134     vdwtype          = mdatoms->typeA;
135
136     vftab            = kernel_data->table_elec->data;
137     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
138
139     /* Setup water-specific parameters */
140     inr              = nlist->iinr[0];
141     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
142     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
143     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
144     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
145
146     jq0              = _mm_set1_ps(charge[inr+0]);
147     jq1              = _mm_set1_ps(charge[inr+1]);
148     jq2              = _mm_set1_ps(charge[inr+2]);
149     vdwjidx0A        = 2*vdwtype[inr+0];
150     qq00             = _mm_mul_ps(iq0,jq0);
151     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
152     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
153     qq01             = _mm_mul_ps(iq0,jq1);
154     qq02             = _mm_mul_ps(iq0,jq2);
155     qq10             = _mm_mul_ps(iq1,jq0);
156     qq11             = _mm_mul_ps(iq1,jq1);
157     qq12             = _mm_mul_ps(iq1,jq2);
158     qq20             = _mm_mul_ps(iq2,jq0);
159     qq21             = _mm_mul_ps(iq2,jq1);
160     qq22             = _mm_mul_ps(iq2,jq2);
161
162     /* Avoid stupid compiler warnings */
163     jnrA = jnrB = jnrC = jnrD = 0;
164     j_coord_offsetA = 0;
165     j_coord_offsetB = 0;
166     j_coord_offsetC = 0;
167     j_coord_offsetD = 0;
168
169     outeriter        = 0;
170     inneriter        = 0;
171
172     for(iidx=0;iidx<4*DIM;iidx++)
173     {
174         scratch[iidx] = 0.0;
175     }  
176
177     /* Start outer loop over neighborlists */
178     for(iidx=0; iidx<nri; iidx++)
179     {
180         /* Load shift vector for this list */
181         i_shift_offset   = DIM*shiftidx[iidx];
182
183         /* Load limits for loop over neighbors */
184         j_index_start    = jindex[iidx];
185         j_index_end      = jindex[iidx+1];
186
187         /* Get outer coordinate index */
188         inr              = iinr[iidx];
189         i_coord_offset   = DIM*inr;
190
191         /* Load i particle coords and add shift vector */
192         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
194         
195         fix0             = _mm_setzero_ps();
196         fiy0             = _mm_setzero_ps();
197         fiz0             = _mm_setzero_ps();
198         fix1             = _mm_setzero_ps();
199         fiy1             = _mm_setzero_ps();
200         fiz1             = _mm_setzero_ps();
201         fix2             = _mm_setzero_ps();
202         fiy2             = _mm_setzero_ps();
203         fiz2             = _mm_setzero_ps();
204
205         /* Reset potential sums */
206         velecsum         = _mm_setzero_ps();
207         vvdwsum          = _mm_setzero_ps();
208
209         /* Start inner kernel loop */
210         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
211         {
212
213             /* Get j neighbor index, and coordinate index */
214             jnrA             = jjnr[jidx];
215             jnrB             = jjnr[jidx+1];
216             jnrC             = jjnr[jidx+2];
217             jnrD             = jjnr[jidx+3];
218             j_coord_offsetA  = DIM*jnrA;
219             j_coord_offsetB  = DIM*jnrB;
220             j_coord_offsetC  = DIM*jnrC;
221             j_coord_offsetD  = DIM*jnrD;
222
223             /* load j atom coordinates */
224             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
225                                               x+j_coord_offsetC,x+j_coord_offsetD,
226                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
227
228             /* Calculate displacement vector */
229             dx00             = _mm_sub_ps(ix0,jx0);
230             dy00             = _mm_sub_ps(iy0,jy0);
231             dz00             = _mm_sub_ps(iz0,jz0);
232             dx01             = _mm_sub_ps(ix0,jx1);
233             dy01             = _mm_sub_ps(iy0,jy1);
234             dz01             = _mm_sub_ps(iz0,jz1);
235             dx02             = _mm_sub_ps(ix0,jx2);
236             dy02             = _mm_sub_ps(iy0,jy2);
237             dz02             = _mm_sub_ps(iz0,jz2);
238             dx10             = _mm_sub_ps(ix1,jx0);
239             dy10             = _mm_sub_ps(iy1,jy0);
240             dz10             = _mm_sub_ps(iz1,jz0);
241             dx11             = _mm_sub_ps(ix1,jx1);
242             dy11             = _mm_sub_ps(iy1,jy1);
243             dz11             = _mm_sub_ps(iz1,jz1);
244             dx12             = _mm_sub_ps(ix1,jx2);
245             dy12             = _mm_sub_ps(iy1,jy2);
246             dz12             = _mm_sub_ps(iz1,jz2);
247             dx20             = _mm_sub_ps(ix2,jx0);
248             dy20             = _mm_sub_ps(iy2,jy0);
249             dz20             = _mm_sub_ps(iz2,jz0);
250             dx21             = _mm_sub_ps(ix2,jx1);
251             dy21             = _mm_sub_ps(iy2,jy1);
252             dz21             = _mm_sub_ps(iz2,jz1);
253             dx22             = _mm_sub_ps(ix2,jx2);
254             dy22             = _mm_sub_ps(iy2,jy2);
255             dz22             = _mm_sub_ps(iz2,jz2);
256
257             /* Calculate squared distance and things based on it */
258             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
259             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
260             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
261             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
262             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
263             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
264             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
265             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
266             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
267
268             rinv00           = sse2_invsqrt_f(rsq00);
269             rinv01           = sse2_invsqrt_f(rsq01);
270             rinv02           = sse2_invsqrt_f(rsq02);
271             rinv10           = sse2_invsqrt_f(rsq10);
272             rinv11           = sse2_invsqrt_f(rsq11);
273             rinv12           = sse2_invsqrt_f(rsq12);
274             rinv20           = sse2_invsqrt_f(rsq20);
275             rinv21           = sse2_invsqrt_f(rsq21);
276             rinv22           = sse2_invsqrt_f(rsq22);
277
278             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
279
280             fjx0             = _mm_setzero_ps();
281             fjy0             = _mm_setzero_ps();
282             fjz0             = _mm_setzero_ps();
283             fjx1             = _mm_setzero_ps();
284             fjy1             = _mm_setzero_ps();
285             fjz1             = _mm_setzero_ps();
286             fjx2             = _mm_setzero_ps();
287             fjy2             = _mm_setzero_ps();
288             fjz2             = _mm_setzero_ps();
289
290             /**************************
291              * CALCULATE INTERACTIONS *
292              **************************/
293
294             r00              = _mm_mul_ps(rsq00,rinv00);
295
296             /* Calculate table index by multiplying r with table scale and truncate to integer */
297             rt               = _mm_mul_ps(r00,vftabscale);
298             vfitab           = _mm_cvttps_epi32(rt);
299             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
300             vfitab           = _mm_slli_epi32(vfitab,2);
301
302             /* CUBIC SPLINE TABLE ELECTROSTATICS */
303             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
304             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
305             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
306             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
307             _MM_TRANSPOSE4_PS(Y,F,G,H);
308             Heps             = _mm_mul_ps(vfeps,H);
309             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
310             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
311             velec            = _mm_mul_ps(qq00,VV);
312             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
313             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
314
315             /* LENNARD-JONES DISPERSION/REPULSION */
316
317             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
318             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
319             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
320             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
321             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
322
323             /* Update potential sum for this i atom from the interaction with this j atom. */
324             velecsum         = _mm_add_ps(velecsum,velec);
325             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
326
327             fscal            = _mm_add_ps(felec,fvdw);
328
329             /* Calculate temporary vectorial force */
330             tx               = _mm_mul_ps(fscal,dx00);
331             ty               = _mm_mul_ps(fscal,dy00);
332             tz               = _mm_mul_ps(fscal,dz00);
333
334             /* Update vectorial force */
335             fix0             = _mm_add_ps(fix0,tx);
336             fiy0             = _mm_add_ps(fiy0,ty);
337             fiz0             = _mm_add_ps(fiz0,tz);
338
339             fjx0             = _mm_add_ps(fjx0,tx);
340             fjy0             = _mm_add_ps(fjy0,ty);
341             fjz0             = _mm_add_ps(fjz0,tz);
342             
343             /**************************
344              * CALCULATE INTERACTIONS *
345              **************************/
346
347             r01              = _mm_mul_ps(rsq01,rinv01);
348
349             /* Calculate table index by multiplying r with table scale and truncate to integer */
350             rt               = _mm_mul_ps(r01,vftabscale);
351             vfitab           = _mm_cvttps_epi32(rt);
352             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
353             vfitab           = _mm_slli_epi32(vfitab,2);
354
355             /* CUBIC SPLINE TABLE ELECTROSTATICS */
356             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
357             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
358             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
359             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
360             _MM_TRANSPOSE4_PS(Y,F,G,H);
361             Heps             = _mm_mul_ps(vfeps,H);
362             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
363             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
364             velec            = _mm_mul_ps(qq01,VV);
365             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
366             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
367
368             /* Update potential sum for this i atom from the interaction with this j atom. */
369             velecsum         = _mm_add_ps(velecsum,velec);
370
371             fscal            = felec;
372
373             /* Calculate temporary vectorial force */
374             tx               = _mm_mul_ps(fscal,dx01);
375             ty               = _mm_mul_ps(fscal,dy01);
376             tz               = _mm_mul_ps(fscal,dz01);
377
378             /* Update vectorial force */
379             fix0             = _mm_add_ps(fix0,tx);
380             fiy0             = _mm_add_ps(fiy0,ty);
381             fiz0             = _mm_add_ps(fiz0,tz);
382
383             fjx1             = _mm_add_ps(fjx1,tx);
384             fjy1             = _mm_add_ps(fjy1,ty);
385             fjz1             = _mm_add_ps(fjz1,tz);
386             
387             /**************************
388              * CALCULATE INTERACTIONS *
389              **************************/
390
391             r02              = _mm_mul_ps(rsq02,rinv02);
392
393             /* Calculate table index by multiplying r with table scale and truncate to integer */
394             rt               = _mm_mul_ps(r02,vftabscale);
395             vfitab           = _mm_cvttps_epi32(rt);
396             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
397             vfitab           = _mm_slli_epi32(vfitab,2);
398
399             /* CUBIC SPLINE TABLE ELECTROSTATICS */
400             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
401             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
402             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
403             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
404             _MM_TRANSPOSE4_PS(Y,F,G,H);
405             Heps             = _mm_mul_ps(vfeps,H);
406             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
407             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
408             velec            = _mm_mul_ps(qq02,VV);
409             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
410             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
411
412             /* Update potential sum for this i atom from the interaction with this j atom. */
413             velecsum         = _mm_add_ps(velecsum,velec);
414
415             fscal            = felec;
416
417             /* Calculate temporary vectorial force */
418             tx               = _mm_mul_ps(fscal,dx02);
419             ty               = _mm_mul_ps(fscal,dy02);
420             tz               = _mm_mul_ps(fscal,dz02);
421
422             /* Update vectorial force */
423             fix0             = _mm_add_ps(fix0,tx);
424             fiy0             = _mm_add_ps(fiy0,ty);
425             fiz0             = _mm_add_ps(fiz0,tz);
426
427             fjx2             = _mm_add_ps(fjx2,tx);
428             fjy2             = _mm_add_ps(fjy2,ty);
429             fjz2             = _mm_add_ps(fjz2,tz);
430             
431             /**************************
432              * CALCULATE INTERACTIONS *
433              **************************/
434
435             r10              = _mm_mul_ps(rsq10,rinv10);
436
437             /* Calculate table index by multiplying r with table scale and truncate to integer */
438             rt               = _mm_mul_ps(r10,vftabscale);
439             vfitab           = _mm_cvttps_epi32(rt);
440             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
441             vfitab           = _mm_slli_epi32(vfitab,2);
442
443             /* CUBIC SPLINE TABLE ELECTROSTATICS */
444             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
445             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
446             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
447             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
448             _MM_TRANSPOSE4_PS(Y,F,G,H);
449             Heps             = _mm_mul_ps(vfeps,H);
450             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
451             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
452             velec            = _mm_mul_ps(qq10,VV);
453             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
454             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
455
456             /* Update potential sum for this i atom from the interaction with this j atom. */
457             velecsum         = _mm_add_ps(velecsum,velec);
458
459             fscal            = felec;
460
461             /* Calculate temporary vectorial force */
462             tx               = _mm_mul_ps(fscal,dx10);
463             ty               = _mm_mul_ps(fscal,dy10);
464             tz               = _mm_mul_ps(fscal,dz10);
465
466             /* Update vectorial force */
467             fix1             = _mm_add_ps(fix1,tx);
468             fiy1             = _mm_add_ps(fiy1,ty);
469             fiz1             = _mm_add_ps(fiz1,tz);
470
471             fjx0             = _mm_add_ps(fjx0,tx);
472             fjy0             = _mm_add_ps(fjy0,ty);
473             fjz0             = _mm_add_ps(fjz0,tz);
474             
475             /**************************
476              * CALCULATE INTERACTIONS *
477              **************************/
478
479             r11              = _mm_mul_ps(rsq11,rinv11);
480
481             /* Calculate table index by multiplying r with table scale and truncate to integer */
482             rt               = _mm_mul_ps(r11,vftabscale);
483             vfitab           = _mm_cvttps_epi32(rt);
484             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
485             vfitab           = _mm_slli_epi32(vfitab,2);
486
487             /* CUBIC SPLINE TABLE ELECTROSTATICS */
488             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
489             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
490             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
491             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
492             _MM_TRANSPOSE4_PS(Y,F,G,H);
493             Heps             = _mm_mul_ps(vfeps,H);
494             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
495             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
496             velec            = _mm_mul_ps(qq11,VV);
497             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
498             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
499
500             /* Update potential sum for this i atom from the interaction with this j atom. */
501             velecsum         = _mm_add_ps(velecsum,velec);
502
503             fscal            = felec;
504
505             /* Calculate temporary vectorial force */
506             tx               = _mm_mul_ps(fscal,dx11);
507             ty               = _mm_mul_ps(fscal,dy11);
508             tz               = _mm_mul_ps(fscal,dz11);
509
510             /* Update vectorial force */
511             fix1             = _mm_add_ps(fix1,tx);
512             fiy1             = _mm_add_ps(fiy1,ty);
513             fiz1             = _mm_add_ps(fiz1,tz);
514
515             fjx1             = _mm_add_ps(fjx1,tx);
516             fjy1             = _mm_add_ps(fjy1,ty);
517             fjz1             = _mm_add_ps(fjz1,tz);
518             
519             /**************************
520              * CALCULATE INTERACTIONS *
521              **************************/
522
523             r12              = _mm_mul_ps(rsq12,rinv12);
524
525             /* Calculate table index by multiplying r with table scale and truncate to integer */
526             rt               = _mm_mul_ps(r12,vftabscale);
527             vfitab           = _mm_cvttps_epi32(rt);
528             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
529             vfitab           = _mm_slli_epi32(vfitab,2);
530
531             /* CUBIC SPLINE TABLE ELECTROSTATICS */
532             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
533             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
534             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
535             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
536             _MM_TRANSPOSE4_PS(Y,F,G,H);
537             Heps             = _mm_mul_ps(vfeps,H);
538             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
539             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
540             velec            = _mm_mul_ps(qq12,VV);
541             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
542             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
543
544             /* Update potential sum for this i atom from the interaction with this j atom. */
545             velecsum         = _mm_add_ps(velecsum,velec);
546
547             fscal            = felec;
548
549             /* Calculate temporary vectorial force */
550             tx               = _mm_mul_ps(fscal,dx12);
551             ty               = _mm_mul_ps(fscal,dy12);
552             tz               = _mm_mul_ps(fscal,dz12);
553
554             /* Update vectorial force */
555             fix1             = _mm_add_ps(fix1,tx);
556             fiy1             = _mm_add_ps(fiy1,ty);
557             fiz1             = _mm_add_ps(fiz1,tz);
558
559             fjx2             = _mm_add_ps(fjx2,tx);
560             fjy2             = _mm_add_ps(fjy2,ty);
561             fjz2             = _mm_add_ps(fjz2,tz);
562             
563             /**************************
564              * CALCULATE INTERACTIONS *
565              **************************/
566
567             r20              = _mm_mul_ps(rsq20,rinv20);
568
569             /* Calculate table index by multiplying r with table scale and truncate to integer */
570             rt               = _mm_mul_ps(r20,vftabscale);
571             vfitab           = _mm_cvttps_epi32(rt);
572             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
573             vfitab           = _mm_slli_epi32(vfitab,2);
574
575             /* CUBIC SPLINE TABLE ELECTROSTATICS */
576             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
577             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
578             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
579             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
580             _MM_TRANSPOSE4_PS(Y,F,G,H);
581             Heps             = _mm_mul_ps(vfeps,H);
582             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
583             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
584             velec            = _mm_mul_ps(qq20,VV);
585             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
586             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
587
588             /* Update potential sum for this i atom from the interaction with this j atom. */
589             velecsum         = _mm_add_ps(velecsum,velec);
590
591             fscal            = felec;
592
593             /* Calculate temporary vectorial force */
594             tx               = _mm_mul_ps(fscal,dx20);
595             ty               = _mm_mul_ps(fscal,dy20);
596             tz               = _mm_mul_ps(fscal,dz20);
597
598             /* Update vectorial force */
599             fix2             = _mm_add_ps(fix2,tx);
600             fiy2             = _mm_add_ps(fiy2,ty);
601             fiz2             = _mm_add_ps(fiz2,tz);
602
603             fjx0             = _mm_add_ps(fjx0,tx);
604             fjy0             = _mm_add_ps(fjy0,ty);
605             fjz0             = _mm_add_ps(fjz0,tz);
606             
607             /**************************
608              * CALCULATE INTERACTIONS *
609              **************************/
610
611             r21              = _mm_mul_ps(rsq21,rinv21);
612
613             /* Calculate table index by multiplying r with table scale and truncate to integer */
614             rt               = _mm_mul_ps(r21,vftabscale);
615             vfitab           = _mm_cvttps_epi32(rt);
616             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
617             vfitab           = _mm_slli_epi32(vfitab,2);
618
619             /* CUBIC SPLINE TABLE ELECTROSTATICS */
620             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
621             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
622             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
623             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
624             _MM_TRANSPOSE4_PS(Y,F,G,H);
625             Heps             = _mm_mul_ps(vfeps,H);
626             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
627             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
628             velec            = _mm_mul_ps(qq21,VV);
629             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
630             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
631
632             /* Update potential sum for this i atom from the interaction with this j atom. */
633             velecsum         = _mm_add_ps(velecsum,velec);
634
635             fscal            = felec;
636
637             /* Calculate temporary vectorial force */
638             tx               = _mm_mul_ps(fscal,dx21);
639             ty               = _mm_mul_ps(fscal,dy21);
640             tz               = _mm_mul_ps(fscal,dz21);
641
642             /* Update vectorial force */
643             fix2             = _mm_add_ps(fix2,tx);
644             fiy2             = _mm_add_ps(fiy2,ty);
645             fiz2             = _mm_add_ps(fiz2,tz);
646
647             fjx1             = _mm_add_ps(fjx1,tx);
648             fjy1             = _mm_add_ps(fjy1,ty);
649             fjz1             = _mm_add_ps(fjz1,tz);
650             
651             /**************************
652              * CALCULATE INTERACTIONS *
653              **************************/
654
655             r22              = _mm_mul_ps(rsq22,rinv22);
656
657             /* Calculate table index by multiplying r with table scale and truncate to integer */
658             rt               = _mm_mul_ps(r22,vftabscale);
659             vfitab           = _mm_cvttps_epi32(rt);
660             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
661             vfitab           = _mm_slli_epi32(vfitab,2);
662
663             /* CUBIC SPLINE TABLE ELECTROSTATICS */
664             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
665             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
666             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
667             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
668             _MM_TRANSPOSE4_PS(Y,F,G,H);
669             Heps             = _mm_mul_ps(vfeps,H);
670             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
671             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
672             velec            = _mm_mul_ps(qq22,VV);
673             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
674             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
675
676             /* Update potential sum for this i atom from the interaction with this j atom. */
677             velecsum         = _mm_add_ps(velecsum,velec);
678
679             fscal            = felec;
680
681             /* Calculate temporary vectorial force */
682             tx               = _mm_mul_ps(fscal,dx22);
683             ty               = _mm_mul_ps(fscal,dy22);
684             tz               = _mm_mul_ps(fscal,dz22);
685
686             /* Update vectorial force */
687             fix2             = _mm_add_ps(fix2,tx);
688             fiy2             = _mm_add_ps(fiy2,ty);
689             fiz2             = _mm_add_ps(fiz2,tz);
690
691             fjx2             = _mm_add_ps(fjx2,tx);
692             fjy2             = _mm_add_ps(fjy2,ty);
693             fjz2             = _mm_add_ps(fjz2,tz);
694             
695             fjptrA             = f+j_coord_offsetA;
696             fjptrB             = f+j_coord_offsetB;
697             fjptrC             = f+j_coord_offsetC;
698             fjptrD             = f+j_coord_offsetD;
699
700             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
701                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
702
703             /* Inner loop uses 400 flops */
704         }
705
706         if(jidx<j_index_end)
707         {
708
709             /* Get j neighbor index, and coordinate index */
710             jnrlistA         = jjnr[jidx];
711             jnrlistB         = jjnr[jidx+1];
712             jnrlistC         = jjnr[jidx+2];
713             jnrlistD         = jjnr[jidx+3];
714             /* Sign of each element will be negative for non-real atoms.
715              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
716              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
717              */
718             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
719             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
720             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
721             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
722             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
723             j_coord_offsetA  = DIM*jnrA;
724             j_coord_offsetB  = DIM*jnrB;
725             j_coord_offsetC  = DIM*jnrC;
726             j_coord_offsetD  = DIM*jnrD;
727
728             /* load j atom coordinates */
729             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
730                                               x+j_coord_offsetC,x+j_coord_offsetD,
731                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
732
733             /* Calculate displacement vector */
734             dx00             = _mm_sub_ps(ix0,jx0);
735             dy00             = _mm_sub_ps(iy0,jy0);
736             dz00             = _mm_sub_ps(iz0,jz0);
737             dx01             = _mm_sub_ps(ix0,jx1);
738             dy01             = _mm_sub_ps(iy0,jy1);
739             dz01             = _mm_sub_ps(iz0,jz1);
740             dx02             = _mm_sub_ps(ix0,jx2);
741             dy02             = _mm_sub_ps(iy0,jy2);
742             dz02             = _mm_sub_ps(iz0,jz2);
743             dx10             = _mm_sub_ps(ix1,jx0);
744             dy10             = _mm_sub_ps(iy1,jy0);
745             dz10             = _mm_sub_ps(iz1,jz0);
746             dx11             = _mm_sub_ps(ix1,jx1);
747             dy11             = _mm_sub_ps(iy1,jy1);
748             dz11             = _mm_sub_ps(iz1,jz1);
749             dx12             = _mm_sub_ps(ix1,jx2);
750             dy12             = _mm_sub_ps(iy1,jy2);
751             dz12             = _mm_sub_ps(iz1,jz2);
752             dx20             = _mm_sub_ps(ix2,jx0);
753             dy20             = _mm_sub_ps(iy2,jy0);
754             dz20             = _mm_sub_ps(iz2,jz0);
755             dx21             = _mm_sub_ps(ix2,jx1);
756             dy21             = _mm_sub_ps(iy2,jy1);
757             dz21             = _mm_sub_ps(iz2,jz1);
758             dx22             = _mm_sub_ps(ix2,jx2);
759             dy22             = _mm_sub_ps(iy2,jy2);
760             dz22             = _mm_sub_ps(iz2,jz2);
761
762             /* Calculate squared distance and things based on it */
763             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
764             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
765             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
766             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
767             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
768             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
769             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
770             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
771             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
772
773             rinv00           = sse2_invsqrt_f(rsq00);
774             rinv01           = sse2_invsqrt_f(rsq01);
775             rinv02           = sse2_invsqrt_f(rsq02);
776             rinv10           = sse2_invsqrt_f(rsq10);
777             rinv11           = sse2_invsqrt_f(rsq11);
778             rinv12           = sse2_invsqrt_f(rsq12);
779             rinv20           = sse2_invsqrt_f(rsq20);
780             rinv21           = sse2_invsqrt_f(rsq21);
781             rinv22           = sse2_invsqrt_f(rsq22);
782
783             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
784
785             fjx0             = _mm_setzero_ps();
786             fjy0             = _mm_setzero_ps();
787             fjz0             = _mm_setzero_ps();
788             fjx1             = _mm_setzero_ps();
789             fjy1             = _mm_setzero_ps();
790             fjz1             = _mm_setzero_ps();
791             fjx2             = _mm_setzero_ps();
792             fjy2             = _mm_setzero_ps();
793             fjz2             = _mm_setzero_ps();
794
795             /**************************
796              * CALCULATE INTERACTIONS *
797              **************************/
798
799             r00              = _mm_mul_ps(rsq00,rinv00);
800             r00              = _mm_andnot_ps(dummy_mask,r00);
801
802             /* Calculate table index by multiplying r with table scale and truncate to integer */
803             rt               = _mm_mul_ps(r00,vftabscale);
804             vfitab           = _mm_cvttps_epi32(rt);
805             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
806             vfitab           = _mm_slli_epi32(vfitab,2);
807
808             /* CUBIC SPLINE TABLE ELECTROSTATICS */
809             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
810             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
811             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
812             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
813             _MM_TRANSPOSE4_PS(Y,F,G,H);
814             Heps             = _mm_mul_ps(vfeps,H);
815             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
816             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
817             velec            = _mm_mul_ps(qq00,VV);
818             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
819             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
820
821             /* LENNARD-JONES DISPERSION/REPULSION */
822
823             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
824             vvdw6            = _mm_mul_ps(c6_00,rinvsix);
825             vvdw12           = _mm_mul_ps(c12_00,_mm_mul_ps(rinvsix,rinvsix));
826             vvdw             = _mm_sub_ps( _mm_mul_ps(vvdw12,one_twelfth) , _mm_mul_ps(vvdw6,one_sixth) );
827             fvdw             = _mm_mul_ps(_mm_sub_ps(vvdw12,vvdw6),rinvsq00);
828
829             /* Update potential sum for this i atom from the interaction with this j atom. */
830             velec            = _mm_andnot_ps(dummy_mask,velec);
831             velecsum         = _mm_add_ps(velecsum,velec);
832             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
833             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
834
835             fscal            = _mm_add_ps(felec,fvdw);
836
837             fscal            = _mm_andnot_ps(dummy_mask,fscal);
838
839             /* Calculate temporary vectorial force */
840             tx               = _mm_mul_ps(fscal,dx00);
841             ty               = _mm_mul_ps(fscal,dy00);
842             tz               = _mm_mul_ps(fscal,dz00);
843
844             /* Update vectorial force */
845             fix0             = _mm_add_ps(fix0,tx);
846             fiy0             = _mm_add_ps(fiy0,ty);
847             fiz0             = _mm_add_ps(fiz0,tz);
848
849             fjx0             = _mm_add_ps(fjx0,tx);
850             fjy0             = _mm_add_ps(fjy0,ty);
851             fjz0             = _mm_add_ps(fjz0,tz);
852             
853             /**************************
854              * CALCULATE INTERACTIONS *
855              **************************/
856
857             r01              = _mm_mul_ps(rsq01,rinv01);
858             r01              = _mm_andnot_ps(dummy_mask,r01);
859
860             /* Calculate table index by multiplying r with table scale and truncate to integer */
861             rt               = _mm_mul_ps(r01,vftabscale);
862             vfitab           = _mm_cvttps_epi32(rt);
863             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
864             vfitab           = _mm_slli_epi32(vfitab,2);
865
866             /* CUBIC SPLINE TABLE ELECTROSTATICS */
867             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
868             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
869             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
870             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
871             _MM_TRANSPOSE4_PS(Y,F,G,H);
872             Heps             = _mm_mul_ps(vfeps,H);
873             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
874             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
875             velec            = _mm_mul_ps(qq01,VV);
876             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
877             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
878
879             /* Update potential sum for this i atom from the interaction with this j atom. */
880             velec            = _mm_andnot_ps(dummy_mask,velec);
881             velecsum         = _mm_add_ps(velecsum,velec);
882
883             fscal            = felec;
884
885             fscal            = _mm_andnot_ps(dummy_mask,fscal);
886
887             /* Calculate temporary vectorial force */
888             tx               = _mm_mul_ps(fscal,dx01);
889             ty               = _mm_mul_ps(fscal,dy01);
890             tz               = _mm_mul_ps(fscal,dz01);
891
892             /* Update vectorial force */
893             fix0             = _mm_add_ps(fix0,tx);
894             fiy0             = _mm_add_ps(fiy0,ty);
895             fiz0             = _mm_add_ps(fiz0,tz);
896
897             fjx1             = _mm_add_ps(fjx1,tx);
898             fjy1             = _mm_add_ps(fjy1,ty);
899             fjz1             = _mm_add_ps(fjz1,tz);
900             
901             /**************************
902              * CALCULATE INTERACTIONS *
903              **************************/
904
905             r02              = _mm_mul_ps(rsq02,rinv02);
906             r02              = _mm_andnot_ps(dummy_mask,r02);
907
908             /* Calculate table index by multiplying r with table scale and truncate to integer */
909             rt               = _mm_mul_ps(r02,vftabscale);
910             vfitab           = _mm_cvttps_epi32(rt);
911             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
912             vfitab           = _mm_slli_epi32(vfitab,2);
913
914             /* CUBIC SPLINE TABLE ELECTROSTATICS */
915             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
916             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
917             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
918             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
919             _MM_TRANSPOSE4_PS(Y,F,G,H);
920             Heps             = _mm_mul_ps(vfeps,H);
921             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
922             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
923             velec            = _mm_mul_ps(qq02,VV);
924             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
925             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
926
927             /* Update potential sum for this i atom from the interaction with this j atom. */
928             velec            = _mm_andnot_ps(dummy_mask,velec);
929             velecsum         = _mm_add_ps(velecsum,velec);
930
931             fscal            = felec;
932
933             fscal            = _mm_andnot_ps(dummy_mask,fscal);
934
935             /* Calculate temporary vectorial force */
936             tx               = _mm_mul_ps(fscal,dx02);
937             ty               = _mm_mul_ps(fscal,dy02);
938             tz               = _mm_mul_ps(fscal,dz02);
939
940             /* Update vectorial force */
941             fix0             = _mm_add_ps(fix0,tx);
942             fiy0             = _mm_add_ps(fiy0,ty);
943             fiz0             = _mm_add_ps(fiz0,tz);
944
945             fjx2             = _mm_add_ps(fjx2,tx);
946             fjy2             = _mm_add_ps(fjy2,ty);
947             fjz2             = _mm_add_ps(fjz2,tz);
948             
949             /**************************
950              * CALCULATE INTERACTIONS *
951              **************************/
952
953             r10              = _mm_mul_ps(rsq10,rinv10);
954             r10              = _mm_andnot_ps(dummy_mask,r10);
955
956             /* Calculate table index by multiplying r with table scale and truncate to integer */
957             rt               = _mm_mul_ps(r10,vftabscale);
958             vfitab           = _mm_cvttps_epi32(rt);
959             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
960             vfitab           = _mm_slli_epi32(vfitab,2);
961
962             /* CUBIC SPLINE TABLE ELECTROSTATICS */
963             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
964             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
965             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
966             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
967             _MM_TRANSPOSE4_PS(Y,F,G,H);
968             Heps             = _mm_mul_ps(vfeps,H);
969             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
970             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
971             velec            = _mm_mul_ps(qq10,VV);
972             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
973             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
974
975             /* Update potential sum for this i atom from the interaction with this j atom. */
976             velec            = _mm_andnot_ps(dummy_mask,velec);
977             velecsum         = _mm_add_ps(velecsum,velec);
978
979             fscal            = felec;
980
981             fscal            = _mm_andnot_ps(dummy_mask,fscal);
982
983             /* Calculate temporary vectorial force */
984             tx               = _mm_mul_ps(fscal,dx10);
985             ty               = _mm_mul_ps(fscal,dy10);
986             tz               = _mm_mul_ps(fscal,dz10);
987
988             /* Update vectorial force */
989             fix1             = _mm_add_ps(fix1,tx);
990             fiy1             = _mm_add_ps(fiy1,ty);
991             fiz1             = _mm_add_ps(fiz1,tz);
992
993             fjx0             = _mm_add_ps(fjx0,tx);
994             fjy0             = _mm_add_ps(fjy0,ty);
995             fjz0             = _mm_add_ps(fjz0,tz);
996             
997             /**************************
998              * CALCULATE INTERACTIONS *
999              **************************/
1000
1001             r11              = _mm_mul_ps(rsq11,rinv11);
1002             r11              = _mm_andnot_ps(dummy_mask,r11);
1003
1004             /* Calculate table index by multiplying r with table scale and truncate to integer */
1005             rt               = _mm_mul_ps(r11,vftabscale);
1006             vfitab           = _mm_cvttps_epi32(rt);
1007             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1008             vfitab           = _mm_slli_epi32(vfitab,2);
1009
1010             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1011             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1012             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1013             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1014             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1015             _MM_TRANSPOSE4_PS(Y,F,G,H);
1016             Heps             = _mm_mul_ps(vfeps,H);
1017             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1018             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1019             velec            = _mm_mul_ps(qq11,VV);
1020             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1021             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1022
1023             /* Update potential sum for this i atom from the interaction with this j atom. */
1024             velec            = _mm_andnot_ps(dummy_mask,velec);
1025             velecsum         = _mm_add_ps(velecsum,velec);
1026
1027             fscal            = felec;
1028
1029             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1030
1031             /* Calculate temporary vectorial force */
1032             tx               = _mm_mul_ps(fscal,dx11);
1033             ty               = _mm_mul_ps(fscal,dy11);
1034             tz               = _mm_mul_ps(fscal,dz11);
1035
1036             /* Update vectorial force */
1037             fix1             = _mm_add_ps(fix1,tx);
1038             fiy1             = _mm_add_ps(fiy1,ty);
1039             fiz1             = _mm_add_ps(fiz1,tz);
1040
1041             fjx1             = _mm_add_ps(fjx1,tx);
1042             fjy1             = _mm_add_ps(fjy1,ty);
1043             fjz1             = _mm_add_ps(fjz1,tz);
1044             
1045             /**************************
1046              * CALCULATE INTERACTIONS *
1047              **************************/
1048
1049             r12              = _mm_mul_ps(rsq12,rinv12);
1050             r12              = _mm_andnot_ps(dummy_mask,r12);
1051
1052             /* Calculate table index by multiplying r with table scale and truncate to integer */
1053             rt               = _mm_mul_ps(r12,vftabscale);
1054             vfitab           = _mm_cvttps_epi32(rt);
1055             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1056             vfitab           = _mm_slli_epi32(vfitab,2);
1057
1058             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1059             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1060             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1061             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1062             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1063             _MM_TRANSPOSE4_PS(Y,F,G,H);
1064             Heps             = _mm_mul_ps(vfeps,H);
1065             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1066             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1067             velec            = _mm_mul_ps(qq12,VV);
1068             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1069             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1070
1071             /* Update potential sum for this i atom from the interaction with this j atom. */
1072             velec            = _mm_andnot_ps(dummy_mask,velec);
1073             velecsum         = _mm_add_ps(velecsum,velec);
1074
1075             fscal            = felec;
1076
1077             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1078
1079             /* Calculate temporary vectorial force */
1080             tx               = _mm_mul_ps(fscal,dx12);
1081             ty               = _mm_mul_ps(fscal,dy12);
1082             tz               = _mm_mul_ps(fscal,dz12);
1083
1084             /* Update vectorial force */
1085             fix1             = _mm_add_ps(fix1,tx);
1086             fiy1             = _mm_add_ps(fiy1,ty);
1087             fiz1             = _mm_add_ps(fiz1,tz);
1088
1089             fjx2             = _mm_add_ps(fjx2,tx);
1090             fjy2             = _mm_add_ps(fjy2,ty);
1091             fjz2             = _mm_add_ps(fjz2,tz);
1092             
1093             /**************************
1094              * CALCULATE INTERACTIONS *
1095              **************************/
1096
1097             r20              = _mm_mul_ps(rsq20,rinv20);
1098             r20              = _mm_andnot_ps(dummy_mask,r20);
1099
1100             /* Calculate table index by multiplying r with table scale and truncate to integer */
1101             rt               = _mm_mul_ps(r20,vftabscale);
1102             vfitab           = _mm_cvttps_epi32(rt);
1103             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1104             vfitab           = _mm_slli_epi32(vfitab,2);
1105
1106             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1107             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1108             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1109             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1110             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1111             _MM_TRANSPOSE4_PS(Y,F,G,H);
1112             Heps             = _mm_mul_ps(vfeps,H);
1113             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1114             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1115             velec            = _mm_mul_ps(qq20,VV);
1116             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1117             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1118
1119             /* Update potential sum for this i atom from the interaction with this j atom. */
1120             velec            = _mm_andnot_ps(dummy_mask,velec);
1121             velecsum         = _mm_add_ps(velecsum,velec);
1122
1123             fscal            = felec;
1124
1125             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1126
1127             /* Calculate temporary vectorial force */
1128             tx               = _mm_mul_ps(fscal,dx20);
1129             ty               = _mm_mul_ps(fscal,dy20);
1130             tz               = _mm_mul_ps(fscal,dz20);
1131
1132             /* Update vectorial force */
1133             fix2             = _mm_add_ps(fix2,tx);
1134             fiy2             = _mm_add_ps(fiy2,ty);
1135             fiz2             = _mm_add_ps(fiz2,tz);
1136
1137             fjx0             = _mm_add_ps(fjx0,tx);
1138             fjy0             = _mm_add_ps(fjy0,ty);
1139             fjz0             = _mm_add_ps(fjz0,tz);
1140             
1141             /**************************
1142              * CALCULATE INTERACTIONS *
1143              **************************/
1144
1145             r21              = _mm_mul_ps(rsq21,rinv21);
1146             r21              = _mm_andnot_ps(dummy_mask,r21);
1147
1148             /* Calculate table index by multiplying r with table scale and truncate to integer */
1149             rt               = _mm_mul_ps(r21,vftabscale);
1150             vfitab           = _mm_cvttps_epi32(rt);
1151             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1152             vfitab           = _mm_slli_epi32(vfitab,2);
1153
1154             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1155             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1156             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1157             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1158             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1159             _MM_TRANSPOSE4_PS(Y,F,G,H);
1160             Heps             = _mm_mul_ps(vfeps,H);
1161             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1162             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1163             velec            = _mm_mul_ps(qq21,VV);
1164             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1165             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1166
1167             /* Update potential sum for this i atom from the interaction with this j atom. */
1168             velec            = _mm_andnot_ps(dummy_mask,velec);
1169             velecsum         = _mm_add_ps(velecsum,velec);
1170
1171             fscal            = felec;
1172
1173             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1174
1175             /* Calculate temporary vectorial force */
1176             tx               = _mm_mul_ps(fscal,dx21);
1177             ty               = _mm_mul_ps(fscal,dy21);
1178             tz               = _mm_mul_ps(fscal,dz21);
1179
1180             /* Update vectorial force */
1181             fix2             = _mm_add_ps(fix2,tx);
1182             fiy2             = _mm_add_ps(fiy2,ty);
1183             fiz2             = _mm_add_ps(fiz2,tz);
1184
1185             fjx1             = _mm_add_ps(fjx1,tx);
1186             fjy1             = _mm_add_ps(fjy1,ty);
1187             fjz1             = _mm_add_ps(fjz1,tz);
1188             
1189             /**************************
1190              * CALCULATE INTERACTIONS *
1191              **************************/
1192
1193             r22              = _mm_mul_ps(rsq22,rinv22);
1194             r22              = _mm_andnot_ps(dummy_mask,r22);
1195
1196             /* Calculate table index by multiplying r with table scale and truncate to integer */
1197             rt               = _mm_mul_ps(r22,vftabscale);
1198             vfitab           = _mm_cvttps_epi32(rt);
1199             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1200             vfitab           = _mm_slli_epi32(vfitab,2);
1201
1202             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1203             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1204             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1205             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1206             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1207             _MM_TRANSPOSE4_PS(Y,F,G,H);
1208             Heps             = _mm_mul_ps(vfeps,H);
1209             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1210             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1211             velec            = _mm_mul_ps(qq22,VV);
1212             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1213             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1214
1215             /* Update potential sum for this i atom from the interaction with this j atom. */
1216             velec            = _mm_andnot_ps(dummy_mask,velec);
1217             velecsum         = _mm_add_ps(velecsum,velec);
1218
1219             fscal            = felec;
1220
1221             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1222
1223             /* Calculate temporary vectorial force */
1224             tx               = _mm_mul_ps(fscal,dx22);
1225             ty               = _mm_mul_ps(fscal,dy22);
1226             tz               = _mm_mul_ps(fscal,dz22);
1227
1228             /* Update vectorial force */
1229             fix2             = _mm_add_ps(fix2,tx);
1230             fiy2             = _mm_add_ps(fiy2,ty);
1231             fiz2             = _mm_add_ps(fiz2,tz);
1232
1233             fjx2             = _mm_add_ps(fjx2,tx);
1234             fjy2             = _mm_add_ps(fjy2,ty);
1235             fjz2             = _mm_add_ps(fjz2,tz);
1236             
1237             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1238             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1239             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1240             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1241
1242             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1243                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1244
1245             /* Inner loop uses 409 flops */
1246         }
1247
1248         /* End of innermost loop */
1249
1250         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1251                                               f+i_coord_offset,fshift+i_shift_offset);
1252
1253         ggid                        = gid[iidx];
1254         /* Update potential energies */
1255         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1256         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1257
1258         /* Increment number of inner iterations */
1259         inneriter                  += j_index_end - j_index_start;
1260
1261         /* Outer loop uses 20 flops */
1262     }
1263
1264     /* Increment number of outer iterations */
1265     outeriter        += nri;
1266
1267     /* Update outer/inner flops */
1268
1269     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1270 }
1271 /*
1272  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single
1273  * Electrostatics interaction: CubicSplineTable
1274  * VdW interaction:            LennardJones
1275  * Geometry:                   Water3-Water3
1276  * Calculate force/pot:        Force
1277  */
1278 void
1279 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_sse2_single
1280                     (t_nblist                    * gmx_restrict       nlist,
1281                      rvec                        * gmx_restrict          xx,
1282                      rvec                        * gmx_restrict          ff,
1283                      struct t_forcerec           * gmx_restrict          fr,
1284                      t_mdatoms                   * gmx_restrict     mdatoms,
1285                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1286                      t_nrnb                      * gmx_restrict        nrnb)
1287 {
1288     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1289      * just 0 for non-waters.
1290      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1291      * jnr indices corresponding to data put in the four positions in the SIMD register.
1292      */
1293     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1294     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1295     int              jnrA,jnrB,jnrC,jnrD;
1296     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1297     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1298     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1299     real             rcutoff_scalar;
1300     real             *shiftvec,*fshift,*x,*f;
1301     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1302     real             scratch[4*DIM];
1303     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1304     int              vdwioffset0;
1305     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1306     int              vdwioffset1;
1307     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1308     int              vdwioffset2;
1309     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1310     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1311     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1312     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1313     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1314     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1315     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1316     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1317     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1318     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1319     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1320     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1321     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1322     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1323     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1324     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1325     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1326     real             *charge;
1327     int              nvdwtype;
1328     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1329     int              *vdwtype;
1330     real             *vdwparam;
1331     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1332     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1333     __m128i          vfitab;
1334     __m128i          ifour       = _mm_set1_epi32(4);
1335     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1336     real             *vftab;
1337     __m128           dummy_mask,cutoff_mask;
1338     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1339     __m128           one     = _mm_set1_ps(1.0);
1340     __m128           two     = _mm_set1_ps(2.0);
1341     x                = xx[0];
1342     f                = ff[0];
1343
1344     nri              = nlist->nri;
1345     iinr             = nlist->iinr;
1346     jindex           = nlist->jindex;
1347     jjnr             = nlist->jjnr;
1348     shiftidx         = nlist->shift;
1349     gid              = nlist->gid;
1350     shiftvec         = fr->shift_vec[0];
1351     fshift           = fr->fshift[0];
1352     facel            = _mm_set1_ps(fr->ic->epsfac);
1353     charge           = mdatoms->chargeA;
1354     nvdwtype         = fr->ntype;
1355     vdwparam         = fr->nbfp;
1356     vdwtype          = mdatoms->typeA;
1357
1358     vftab            = kernel_data->table_elec->data;
1359     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1360
1361     /* Setup water-specific parameters */
1362     inr              = nlist->iinr[0];
1363     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1364     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1365     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1366     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1367
1368     jq0              = _mm_set1_ps(charge[inr+0]);
1369     jq1              = _mm_set1_ps(charge[inr+1]);
1370     jq2              = _mm_set1_ps(charge[inr+2]);
1371     vdwjidx0A        = 2*vdwtype[inr+0];
1372     qq00             = _mm_mul_ps(iq0,jq0);
1373     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1374     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1375     qq01             = _mm_mul_ps(iq0,jq1);
1376     qq02             = _mm_mul_ps(iq0,jq2);
1377     qq10             = _mm_mul_ps(iq1,jq0);
1378     qq11             = _mm_mul_ps(iq1,jq1);
1379     qq12             = _mm_mul_ps(iq1,jq2);
1380     qq20             = _mm_mul_ps(iq2,jq0);
1381     qq21             = _mm_mul_ps(iq2,jq1);
1382     qq22             = _mm_mul_ps(iq2,jq2);
1383
1384     /* Avoid stupid compiler warnings */
1385     jnrA = jnrB = jnrC = jnrD = 0;
1386     j_coord_offsetA = 0;
1387     j_coord_offsetB = 0;
1388     j_coord_offsetC = 0;
1389     j_coord_offsetD = 0;
1390
1391     outeriter        = 0;
1392     inneriter        = 0;
1393
1394     for(iidx=0;iidx<4*DIM;iidx++)
1395     {
1396         scratch[iidx] = 0.0;
1397     }  
1398
1399     /* Start outer loop over neighborlists */
1400     for(iidx=0; iidx<nri; iidx++)
1401     {
1402         /* Load shift vector for this list */
1403         i_shift_offset   = DIM*shiftidx[iidx];
1404
1405         /* Load limits for loop over neighbors */
1406         j_index_start    = jindex[iidx];
1407         j_index_end      = jindex[iidx+1];
1408
1409         /* Get outer coordinate index */
1410         inr              = iinr[iidx];
1411         i_coord_offset   = DIM*inr;
1412
1413         /* Load i particle coords and add shift vector */
1414         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1415                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1416         
1417         fix0             = _mm_setzero_ps();
1418         fiy0             = _mm_setzero_ps();
1419         fiz0             = _mm_setzero_ps();
1420         fix1             = _mm_setzero_ps();
1421         fiy1             = _mm_setzero_ps();
1422         fiz1             = _mm_setzero_ps();
1423         fix2             = _mm_setzero_ps();
1424         fiy2             = _mm_setzero_ps();
1425         fiz2             = _mm_setzero_ps();
1426
1427         /* Start inner kernel loop */
1428         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1429         {
1430
1431             /* Get j neighbor index, and coordinate index */
1432             jnrA             = jjnr[jidx];
1433             jnrB             = jjnr[jidx+1];
1434             jnrC             = jjnr[jidx+2];
1435             jnrD             = jjnr[jidx+3];
1436             j_coord_offsetA  = DIM*jnrA;
1437             j_coord_offsetB  = DIM*jnrB;
1438             j_coord_offsetC  = DIM*jnrC;
1439             j_coord_offsetD  = DIM*jnrD;
1440
1441             /* load j atom coordinates */
1442             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1443                                               x+j_coord_offsetC,x+j_coord_offsetD,
1444                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1445
1446             /* Calculate displacement vector */
1447             dx00             = _mm_sub_ps(ix0,jx0);
1448             dy00             = _mm_sub_ps(iy0,jy0);
1449             dz00             = _mm_sub_ps(iz0,jz0);
1450             dx01             = _mm_sub_ps(ix0,jx1);
1451             dy01             = _mm_sub_ps(iy0,jy1);
1452             dz01             = _mm_sub_ps(iz0,jz1);
1453             dx02             = _mm_sub_ps(ix0,jx2);
1454             dy02             = _mm_sub_ps(iy0,jy2);
1455             dz02             = _mm_sub_ps(iz0,jz2);
1456             dx10             = _mm_sub_ps(ix1,jx0);
1457             dy10             = _mm_sub_ps(iy1,jy0);
1458             dz10             = _mm_sub_ps(iz1,jz0);
1459             dx11             = _mm_sub_ps(ix1,jx1);
1460             dy11             = _mm_sub_ps(iy1,jy1);
1461             dz11             = _mm_sub_ps(iz1,jz1);
1462             dx12             = _mm_sub_ps(ix1,jx2);
1463             dy12             = _mm_sub_ps(iy1,jy2);
1464             dz12             = _mm_sub_ps(iz1,jz2);
1465             dx20             = _mm_sub_ps(ix2,jx0);
1466             dy20             = _mm_sub_ps(iy2,jy0);
1467             dz20             = _mm_sub_ps(iz2,jz0);
1468             dx21             = _mm_sub_ps(ix2,jx1);
1469             dy21             = _mm_sub_ps(iy2,jy1);
1470             dz21             = _mm_sub_ps(iz2,jz1);
1471             dx22             = _mm_sub_ps(ix2,jx2);
1472             dy22             = _mm_sub_ps(iy2,jy2);
1473             dz22             = _mm_sub_ps(iz2,jz2);
1474
1475             /* Calculate squared distance and things based on it */
1476             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1477             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1478             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1479             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1480             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1481             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1482             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1483             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1484             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1485
1486             rinv00           = sse2_invsqrt_f(rsq00);
1487             rinv01           = sse2_invsqrt_f(rsq01);
1488             rinv02           = sse2_invsqrt_f(rsq02);
1489             rinv10           = sse2_invsqrt_f(rsq10);
1490             rinv11           = sse2_invsqrt_f(rsq11);
1491             rinv12           = sse2_invsqrt_f(rsq12);
1492             rinv20           = sse2_invsqrt_f(rsq20);
1493             rinv21           = sse2_invsqrt_f(rsq21);
1494             rinv22           = sse2_invsqrt_f(rsq22);
1495
1496             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1497
1498             fjx0             = _mm_setzero_ps();
1499             fjy0             = _mm_setzero_ps();
1500             fjz0             = _mm_setzero_ps();
1501             fjx1             = _mm_setzero_ps();
1502             fjy1             = _mm_setzero_ps();
1503             fjz1             = _mm_setzero_ps();
1504             fjx2             = _mm_setzero_ps();
1505             fjy2             = _mm_setzero_ps();
1506             fjz2             = _mm_setzero_ps();
1507
1508             /**************************
1509              * CALCULATE INTERACTIONS *
1510              **************************/
1511
1512             r00              = _mm_mul_ps(rsq00,rinv00);
1513
1514             /* Calculate table index by multiplying r with table scale and truncate to integer */
1515             rt               = _mm_mul_ps(r00,vftabscale);
1516             vfitab           = _mm_cvttps_epi32(rt);
1517             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1518             vfitab           = _mm_slli_epi32(vfitab,2);
1519
1520             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1521             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1522             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1523             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1524             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1525             _MM_TRANSPOSE4_PS(Y,F,G,H);
1526             Heps             = _mm_mul_ps(vfeps,H);
1527             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1528             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1529             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1530
1531             /* LENNARD-JONES DISPERSION/REPULSION */
1532
1533             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1534             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1535
1536             fscal            = _mm_add_ps(felec,fvdw);
1537
1538             /* Calculate temporary vectorial force */
1539             tx               = _mm_mul_ps(fscal,dx00);
1540             ty               = _mm_mul_ps(fscal,dy00);
1541             tz               = _mm_mul_ps(fscal,dz00);
1542
1543             /* Update vectorial force */
1544             fix0             = _mm_add_ps(fix0,tx);
1545             fiy0             = _mm_add_ps(fiy0,ty);
1546             fiz0             = _mm_add_ps(fiz0,tz);
1547
1548             fjx0             = _mm_add_ps(fjx0,tx);
1549             fjy0             = _mm_add_ps(fjy0,ty);
1550             fjz0             = _mm_add_ps(fjz0,tz);
1551             
1552             /**************************
1553              * CALCULATE INTERACTIONS *
1554              **************************/
1555
1556             r01              = _mm_mul_ps(rsq01,rinv01);
1557
1558             /* Calculate table index by multiplying r with table scale and truncate to integer */
1559             rt               = _mm_mul_ps(r01,vftabscale);
1560             vfitab           = _mm_cvttps_epi32(rt);
1561             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1562             vfitab           = _mm_slli_epi32(vfitab,2);
1563
1564             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1565             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1566             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1567             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1568             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1569             _MM_TRANSPOSE4_PS(Y,F,G,H);
1570             Heps             = _mm_mul_ps(vfeps,H);
1571             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1572             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1573             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1574
1575             fscal            = felec;
1576
1577             /* Calculate temporary vectorial force */
1578             tx               = _mm_mul_ps(fscal,dx01);
1579             ty               = _mm_mul_ps(fscal,dy01);
1580             tz               = _mm_mul_ps(fscal,dz01);
1581
1582             /* Update vectorial force */
1583             fix0             = _mm_add_ps(fix0,tx);
1584             fiy0             = _mm_add_ps(fiy0,ty);
1585             fiz0             = _mm_add_ps(fiz0,tz);
1586
1587             fjx1             = _mm_add_ps(fjx1,tx);
1588             fjy1             = _mm_add_ps(fjy1,ty);
1589             fjz1             = _mm_add_ps(fjz1,tz);
1590             
1591             /**************************
1592              * CALCULATE INTERACTIONS *
1593              **************************/
1594
1595             r02              = _mm_mul_ps(rsq02,rinv02);
1596
1597             /* Calculate table index by multiplying r with table scale and truncate to integer */
1598             rt               = _mm_mul_ps(r02,vftabscale);
1599             vfitab           = _mm_cvttps_epi32(rt);
1600             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1601             vfitab           = _mm_slli_epi32(vfitab,2);
1602
1603             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1604             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1605             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1606             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1607             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1608             _MM_TRANSPOSE4_PS(Y,F,G,H);
1609             Heps             = _mm_mul_ps(vfeps,H);
1610             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1611             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1612             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1613
1614             fscal            = felec;
1615
1616             /* Calculate temporary vectorial force */
1617             tx               = _mm_mul_ps(fscal,dx02);
1618             ty               = _mm_mul_ps(fscal,dy02);
1619             tz               = _mm_mul_ps(fscal,dz02);
1620
1621             /* Update vectorial force */
1622             fix0             = _mm_add_ps(fix0,tx);
1623             fiy0             = _mm_add_ps(fiy0,ty);
1624             fiz0             = _mm_add_ps(fiz0,tz);
1625
1626             fjx2             = _mm_add_ps(fjx2,tx);
1627             fjy2             = _mm_add_ps(fjy2,ty);
1628             fjz2             = _mm_add_ps(fjz2,tz);
1629             
1630             /**************************
1631              * CALCULATE INTERACTIONS *
1632              **************************/
1633
1634             r10              = _mm_mul_ps(rsq10,rinv10);
1635
1636             /* Calculate table index by multiplying r with table scale and truncate to integer */
1637             rt               = _mm_mul_ps(r10,vftabscale);
1638             vfitab           = _mm_cvttps_epi32(rt);
1639             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1640             vfitab           = _mm_slli_epi32(vfitab,2);
1641
1642             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1643             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1644             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1645             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1646             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1647             _MM_TRANSPOSE4_PS(Y,F,G,H);
1648             Heps             = _mm_mul_ps(vfeps,H);
1649             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1650             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1651             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1652
1653             fscal            = felec;
1654
1655             /* Calculate temporary vectorial force */
1656             tx               = _mm_mul_ps(fscal,dx10);
1657             ty               = _mm_mul_ps(fscal,dy10);
1658             tz               = _mm_mul_ps(fscal,dz10);
1659
1660             /* Update vectorial force */
1661             fix1             = _mm_add_ps(fix1,tx);
1662             fiy1             = _mm_add_ps(fiy1,ty);
1663             fiz1             = _mm_add_ps(fiz1,tz);
1664
1665             fjx0             = _mm_add_ps(fjx0,tx);
1666             fjy0             = _mm_add_ps(fjy0,ty);
1667             fjz0             = _mm_add_ps(fjz0,tz);
1668             
1669             /**************************
1670              * CALCULATE INTERACTIONS *
1671              **************************/
1672
1673             r11              = _mm_mul_ps(rsq11,rinv11);
1674
1675             /* Calculate table index by multiplying r with table scale and truncate to integer */
1676             rt               = _mm_mul_ps(r11,vftabscale);
1677             vfitab           = _mm_cvttps_epi32(rt);
1678             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1679             vfitab           = _mm_slli_epi32(vfitab,2);
1680
1681             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1682             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1683             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1684             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1685             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1686             _MM_TRANSPOSE4_PS(Y,F,G,H);
1687             Heps             = _mm_mul_ps(vfeps,H);
1688             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1689             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1690             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1691
1692             fscal            = felec;
1693
1694             /* Calculate temporary vectorial force */
1695             tx               = _mm_mul_ps(fscal,dx11);
1696             ty               = _mm_mul_ps(fscal,dy11);
1697             tz               = _mm_mul_ps(fscal,dz11);
1698
1699             /* Update vectorial force */
1700             fix1             = _mm_add_ps(fix1,tx);
1701             fiy1             = _mm_add_ps(fiy1,ty);
1702             fiz1             = _mm_add_ps(fiz1,tz);
1703
1704             fjx1             = _mm_add_ps(fjx1,tx);
1705             fjy1             = _mm_add_ps(fjy1,ty);
1706             fjz1             = _mm_add_ps(fjz1,tz);
1707             
1708             /**************************
1709              * CALCULATE INTERACTIONS *
1710              **************************/
1711
1712             r12              = _mm_mul_ps(rsq12,rinv12);
1713
1714             /* Calculate table index by multiplying r with table scale and truncate to integer */
1715             rt               = _mm_mul_ps(r12,vftabscale);
1716             vfitab           = _mm_cvttps_epi32(rt);
1717             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1718             vfitab           = _mm_slli_epi32(vfitab,2);
1719
1720             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1721             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1722             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1723             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1724             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1725             _MM_TRANSPOSE4_PS(Y,F,G,H);
1726             Heps             = _mm_mul_ps(vfeps,H);
1727             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1728             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1729             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1730
1731             fscal            = felec;
1732
1733             /* Calculate temporary vectorial force */
1734             tx               = _mm_mul_ps(fscal,dx12);
1735             ty               = _mm_mul_ps(fscal,dy12);
1736             tz               = _mm_mul_ps(fscal,dz12);
1737
1738             /* Update vectorial force */
1739             fix1             = _mm_add_ps(fix1,tx);
1740             fiy1             = _mm_add_ps(fiy1,ty);
1741             fiz1             = _mm_add_ps(fiz1,tz);
1742
1743             fjx2             = _mm_add_ps(fjx2,tx);
1744             fjy2             = _mm_add_ps(fjy2,ty);
1745             fjz2             = _mm_add_ps(fjz2,tz);
1746             
1747             /**************************
1748              * CALCULATE INTERACTIONS *
1749              **************************/
1750
1751             r20              = _mm_mul_ps(rsq20,rinv20);
1752
1753             /* Calculate table index by multiplying r with table scale and truncate to integer */
1754             rt               = _mm_mul_ps(r20,vftabscale);
1755             vfitab           = _mm_cvttps_epi32(rt);
1756             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1757             vfitab           = _mm_slli_epi32(vfitab,2);
1758
1759             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1760             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1761             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1762             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1763             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1764             _MM_TRANSPOSE4_PS(Y,F,G,H);
1765             Heps             = _mm_mul_ps(vfeps,H);
1766             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1767             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1768             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1769
1770             fscal            = felec;
1771
1772             /* Calculate temporary vectorial force */
1773             tx               = _mm_mul_ps(fscal,dx20);
1774             ty               = _mm_mul_ps(fscal,dy20);
1775             tz               = _mm_mul_ps(fscal,dz20);
1776
1777             /* Update vectorial force */
1778             fix2             = _mm_add_ps(fix2,tx);
1779             fiy2             = _mm_add_ps(fiy2,ty);
1780             fiz2             = _mm_add_ps(fiz2,tz);
1781
1782             fjx0             = _mm_add_ps(fjx0,tx);
1783             fjy0             = _mm_add_ps(fjy0,ty);
1784             fjz0             = _mm_add_ps(fjz0,tz);
1785             
1786             /**************************
1787              * CALCULATE INTERACTIONS *
1788              **************************/
1789
1790             r21              = _mm_mul_ps(rsq21,rinv21);
1791
1792             /* Calculate table index by multiplying r with table scale and truncate to integer */
1793             rt               = _mm_mul_ps(r21,vftabscale);
1794             vfitab           = _mm_cvttps_epi32(rt);
1795             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1796             vfitab           = _mm_slli_epi32(vfitab,2);
1797
1798             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1799             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1800             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1801             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1802             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1803             _MM_TRANSPOSE4_PS(Y,F,G,H);
1804             Heps             = _mm_mul_ps(vfeps,H);
1805             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1806             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1807             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1808
1809             fscal            = felec;
1810
1811             /* Calculate temporary vectorial force */
1812             tx               = _mm_mul_ps(fscal,dx21);
1813             ty               = _mm_mul_ps(fscal,dy21);
1814             tz               = _mm_mul_ps(fscal,dz21);
1815
1816             /* Update vectorial force */
1817             fix2             = _mm_add_ps(fix2,tx);
1818             fiy2             = _mm_add_ps(fiy2,ty);
1819             fiz2             = _mm_add_ps(fiz2,tz);
1820
1821             fjx1             = _mm_add_ps(fjx1,tx);
1822             fjy1             = _mm_add_ps(fjy1,ty);
1823             fjz1             = _mm_add_ps(fjz1,tz);
1824             
1825             /**************************
1826              * CALCULATE INTERACTIONS *
1827              **************************/
1828
1829             r22              = _mm_mul_ps(rsq22,rinv22);
1830
1831             /* Calculate table index by multiplying r with table scale and truncate to integer */
1832             rt               = _mm_mul_ps(r22,vftabscale);
1833             vfitab           = _mm_cvttps_epi32(rt);
1834             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1835             vfitab           = _mm_slli_epi32(vfitab,2);
1836
1837             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1838             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1839             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1840             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1841             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1842             _MM_TRANSPOSE4_PS(Y,F,G,H);
1843             Heps             = _mm_mul_ps(vfeps,H);
1844             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1845             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1846             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1847
1848             fscal            = felec;
1849
1850             /* Calculate temporary vectorial force */
1851             tx               = _mm_mul_ps(fscal,dx22);
1852             ty               = _mm_mul_ps(fscal,dy22);
1853             tz               = _mm_mul_ps(fscal,dz22);
1854
1855             /* Update vectorial force */
1856             fix2             = _mm_add_ps(fix2,tx);
1857             fiy2             = _mm_add_ps(fiy2,ty);
1858             fiz2             = _mm_add_ps(fiz2,tz);
1859
1860             fjx2             = _mm_add_ps(fjx2,tx);
1861             fjy2             = _mm_add_ps(fjy2,ty);
1862             fjz2             = _mm_add_ps(fjz2,tz);
1863             
1864             fjptrA             = f+j_coord_offsetA;
1865             fjptrB             = f+j_coord_offsetB;
1866             fjptrC             = f+j_coord_offsetC;
1867             fjptrD             = f+j_coord_offsetD;
1868
1869             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1870                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1871
1872             /* Inner loop uses 359 flops */
1873         }
1874
1875         if(jidx<j_index_end)
1876         {
1877
1878             /* Get j neighbor index, and coordinate index */
1879             jnrlistA         = jjnr[jidx];
1880             jnrlistB         = jjnr[jidx+1];
1881             jnrlistC         = jjnr[jidx+2];
1882             jnrlistD         = jjnr[jidx+3];
1883             /* Sign of each element will be negative for non-real atoms.
1884              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1885              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1886              */
1887             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1888             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1889             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1890             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1891             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1892             j_coord_offsetA  = DIM*jnrA;
1893             j_coord_offsetB  = DIM*jnrB;
1894             j_coord_offsetC  = DIM*jnrC;
1895             j_coord_offsetD  = DIM*jnrD;
1896
1897             /* load j atom coordinates */
1898             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1899                                               x+j_coord_offsetC,x+j_coord_offsetD,
1900                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1901
1902             /* Calculate displacement vector */
1903             dx00             = _mm_sub_ps(ix0,jx0);
1904             dy00             = _mm_sub_ps(iy0,jy0);
1905             dz00             = _mm_sub_ps(iz0,jz0);
1906             dx01             = _mm_sub_ps(ix0,jx1);
1907             dy01             = _mm_sub_ps(iy0,jy1);
1908             dz01             = _mm_sub_ps(iz0,jz1);
1909             dx02             = _mm_sub_ps(ix0,jx2);
1910             dy02             = _mm_sub_ps(iy0,jy2);
1911             dz02             = _mm_sub_ps(iz0,jz2);
1912             dx10             = _mm_sub_ps(ix1,jx0);
1913             dy10             = _mm_sub_ps(iy1,jy0);
1914             dz10             = _mm_sub_ps(iz1,jz0);
1915             dx11             = _mm_sub_ps(ix1,jx1);
1916             dy11             = _mm_sub_ps(iy1,jy1);
1917             dz11             = _mm_sub_ps(iz1,jz1);
1918             dx12             = _mm_sub_ps(ix1,jx2);
1919             dy12             = _mm_sub_ps(iy1,jy2);
1920             dz12             = _mm_sub_ps(iz1,jz2);
1921             dx20             = _mm_sub_ps(ix2,jx0);
1922             dy20             = _mm_sub_ps(iy2,jy0);
1923             dz20             = _mm_sub_ps(iz2,jz0);
1924             dx21             = _mm_sub_ps(ix2,jx1);
1925             dy21             = _mm_sub_ps(iy2,jy1);
1926             dz21             = _mm_sub_ps(iz2,jz1);
1927             dx22             = _mm_sub_ps(ix2,jx2);
1928             dy22             = _mm_sub_ps(iy2,jy2);
1929             dz22             = _mm_sub_ps(iz2,jz2);
1930
1931             /* Calculate squared distance and things based on it */
1932             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1933             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1934             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1935             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1936             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1937             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1938             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1939             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1940             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1941
1942             rinv00           = sse2_invsqrt_f(rsq00);
1943             rinv01           = sse2_invsqrt_f(rsq01);
1944             rinv02           = sse2_invsqrt_f(rsq02);
1945             rinv10           = sse2_invsqrt_f(rsq10);
1946             rinv11           = sse2_invsqrt_f(rsq11);
1947             rinv12           = sse2_invsqrt_f(rsq12);
1948             rinv20           = sse2_invsqrt_f(rsq20);
1949             rinv21           = sse2_invsqrt_f(rsq21);
1950             rinv22           = sse2_invsqrt_f(rsq22);
1951
1952             rinvsq00         = _mm_mul_ps(rinv00,rinv00);
1953
1954             fjx0             = _mm_setzero_ps();
1955             fjy0             = _mm_setzero_ps();
1956             fjz0             = _mm_setzero_ps();
1957             fjx1             = _mm_setzero_ps();
1958             fjy1             = _mm_setzero_ps();
1959             fjz1             = _mm_setzero_ps();
1960             fjx2             = _mm_setzero_ps();
1961             fjy2             = _mm_setzero_ps();
1962             fjz2             = _mm_setzero_ps();
1963
1964             /**************************
1965              * CALCULATE INTERACTIONS *
1966              **************************/
1967
1968             r00              = _mm_mul_ps(rsq00,rinv00);
1969             r00              = _mm_andnot_ps(dummy_mask,r00);
1970
1971             /* Calculate table index by multiplying r with table scale and truncate to integer */
1972             rt               = _mm_mul_ps(r00,vftabscale);
1973             vfitab           = _mm_cvttps_epi32(rt);
1974             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1975             vfitab           = _mm_slli_epi32(vfitab,2);
1976
1977             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1978             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1979             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1980             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1981             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1982             _MM_TRANSPOSE4_PS(Y,F,G,H);
1983             Heps             = _mm_mul_ps(vfeps,H);
1984             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1985             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1986             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1987
1988             /* LENNARD-JONES DISPERSION/REPULSION */
1989
1990             rinvsix          = _mm_mul_ps(_mm_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1991             fvdw             = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(c12_00,rinvsix),c6_00),_mm_mul_ps(rinvsix,rinvsq00));
1992
1993             fscal            = _mm_add_ps(felec,fvdw);
1994
1995             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1996
1997             /* Calculate temporary vectorial force */
1998             tx               = _mm_mul_ps(fscal,dx00);
1999             ty               = _mm_mul_ps(fscal,dy00);
2000             tz               = _mm_mul_ps(fscal,dz00);
2001
2002             /* Update vectorial force */
2003             fix0             = _mm_add_ps(fix0,tx);
2004             fiy0             = _mm_add_ps(fiy0,ty);
2005             fiz0             = _mm_add_ps(fiz0,tz);
2006
2007             fjx0             = _mm_add_ps(fjx0,tx);
2008             fjy0             = _mm_add_ps(fjy0,ty);
2009             fjz0             = _mm_add_ps(fjz0,tz);
2010             
2011             /**************************
2012              * CALCULATE INTERACTIONS *
2013              **************************/
2014
2015             r01              = _mm_mul_ps(rsq01,rinv01);
2016             r01              = _mm_andnot_ps(dummy_mask,r01);
2017
2018             /* Calculate table index by multiplying r with table scale and truncate to integer */
2019             rt               = _mm_mul_ps(r01,vftabscale);
2020             vfitab           = _mm_cvttps_epi32(rt);
2021             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2022             vfitab           = _mm_slli_epi32(vfitab,2);
2023
2024             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2025             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2026             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2027             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2028             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2029             _MM_TRANSPOSE4_PS(Y,F,G,H);
2030             Heps             = _mm_mul_ps(vfeps,H);
2031             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2032             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2033             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2034
2035             fscal            = felec;
2036
2037             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2038
2039             /* Calculate temporary vectorial force */
2040             tx               = _mm_mul_ps(fscal,dx01);
2041             ty               = _mm_mul_ps(fscal,dy01);
2042             tz               = _mm_mul_ps(fscal,dz01);
2043
2044             /* Update vectorial force */
2045             fix0             = _mm_add_ps(fix0,tx);
2046             fiy0             = _mm_add_ps(fiy0,ty);
2047             fiz0             = _mm_add_ps(fiz0,tz);
2048
2049             fjx1             = _mm_add_ps(fjx1,tx);
2050             fjy1             = _mm_add_ps(fjy1,ty);
2051             fjz1             = _mm_add_ps(fjz1,tz);
2052             
2053             /**************************
2054              * CALCULATE INTERACTIONS *
2055              **************************/
2056
2057             r02              = _mm_mul_ps(rsq02,rinv02);
2058             r02              = _mm_andnot_ps(dummy_mask,r02);
2059
2060             /* Calculate table index by multiplying r with table scale and truncate to integer */
2061             rt               = _mm_mul_ps(r02,vftabscale);
2062             vfitab           = _mm_cvttps_epi32(rt);
2063             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2064             vfitab           = _mm_slli_epi32(vfitab,2);
2065
2066             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2067             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2068             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2069             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2070             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2071             _MM_TRANSPOSE4_PS(Y,F,G,H);
2072             Heps             = _mm_mul_ps(vfeps,H);
2073             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2074             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2075             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2076
2077             fscal            = felec;
2078
2079             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2080
2081             /* Calculate temporary vectorial force */
2082             tx               = _mm_mul_ps(fscal,dx02);
2083             ty               = _mm_mul_ps(fscal,dy02);
2084             tz               = _mm_mul_ps(fscal,dz02);
2085
2086             /* Update vectorial force */
2087             fix0             = _mm_add_ps(fix0,tx);
2088             fiy0             = _mm_add_ps(fiy0,ty);
2089             fiz0             = _mm_add_ps(fiz0,tz);
2090
2091             fjx2             = _mm_add_ps(fjx2,tx);
2092             fjy2             = _mm_add_ps(fjy2,ty);
2093             fjz2             = _mm_add_ps(fjz2,tz);
2094             
2095             /**************************
2096              * CALCULATE INTERACTIONS *
2097              **************************/
2098
2099             r10              = _mm_mul_ps(rsq10,rinv10);
2100             r10              = _mm_andnot_ps(dummy_mask,r10);
2101
2102             /* Calculate table index by multiplying r with table scale and truncate to integer */
2103             rt               = _mm_mul_ps(r10,vftabscale);
2104             vfitab           = _mm_cvttps_epi32(rt);
2105             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2106             vfitab           = _mm_slli_epi32(vfitab,2);
2107
2108             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2109             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2110             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2111             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2112             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2113             _MM_TRANSPOSE4_PS(Y,F,G,H);
2114             Heps             = _mm_mul_ps(vfeps,H);
2115             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2116             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2117             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2118
2119             fscal            = felec;
2120
2121             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2122
2123             /* Calculate temporary vectorial force */
2124             tx               = _mm_mul_ps(fscal,dx10);
2125             ty               = _mm_mul_ps(fscal,dy10);
2126             tz               = _mm_mul_ps(fscal,dz10);
2127
2128             /* Update vectorial force */
2129             fix1             = _mm_add_ps(fix1,tx);
2130             fiy1             = _mm_add_ps(fiy1,ty);
2131             fiz1             = _mm_add_ps(fiz1,tz);
2132
2133             fjx0             = _mm_add_ps(fjx0,tx);
2134             fjy0             = _mm_add_ps(fjy0,ty);
2135             fjz0             = _mm_add_ps(fjz0,tz);
2136             
2137             /**************************
2138              * CALCULATE INTERACTIONS *
2139              **************************/
2140
2141             r11              = _mm_mul_ps(rsq11,rinv11);
2142             r11              = _mm_andnot_ps(dummy_mask,r11);
2143
2144             /* Calculate table index by multiplying r with table scale and truncate to integer */
2145             rt               = _mm_mul_ps(r11,vftabscale);
2146             vfitab           = _mm_cvttps_epi32(rt);
2147             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2148             vfitab           = _mm_slli_epi32(vfitab,2);
2149
2150             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2151             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2152             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2153             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2154             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2155             _MM_TRANSPOSE4_PS(Y,F,G,H);
2156             Heps             = _mm_mul_ps(vfeps,H);
2157             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2158             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2159             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2160
2161             fscal            = felec;
2162
2163             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2164
2165             /* Calculate temporary vectorial force */
2166             tx               = _mm_mul_ps(fscal,dx11);
2167             ty               = _mm_mul_ps(fscal,dy11);
2168             tz               = _mm_mul_ps(fscal,dz11);
2169
2170             /* Update vectorial force */
2171             fix1             = _mm_add_ps(fix1,tx);
2172             fiy1             = _mm_add_ps(fiy1,ty);
2173             fiz1             = _mm_add_ps(fiz1,tz);
2174
2175             fjx1             = _mm_add_ps(fjx1,tx);
2176             fjy1             = _mm_add_ps(fjy1,ty);
2177             fjz1             = _mm_add_ps(fjz1,tz);
2178             
2179             /**************************
2180              * CALCULATE INTERACTIONS *
2181              **************************/
2182
2183             r12              = _mm_mul_ps(rsq12,rinv12);
2184             r12              = _mm_andnot_ps(dummy_mask,r12);
2185
2186             /* Calculate table index by multiplying r with table scale and truncate to integer */
2187             rt               = _mm_mul_ps(r12,vftabscale);
2188             vfitab           = _mm_cvttps_epi32(rt);
2189             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2190             vfitab           = _mm_slli_epi32(vfitab,2);
2191
2192             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2193             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2194             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2195             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2196             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2197             _MM_TRANSPOSE4_PS(Y,F,G,H);
2198             Heps             = _mm_mul_ps(vfeps,H);
2199             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2200             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2201             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2202
2203             fscal            = felec;
2204
2205             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2206
2207             /* Calculate temporary vectorial force */
2208             tx               = _mm_mul_ps(fscal,dx12);
2209             ty               = _mm_mul_ps(fscal,dy12);
2210             tz               = _mm_mul_ps(fscal,dz12);
2211
2212             /* Update vectorial force */
2213             fix1             = _mm_add_ps(fix1,tx);
2214             fiy1             = _mm_add_ps(fiy1,ty);
2215             fiz1             = _mm_add_ps(fiz1,tz);
2216
2217             fjx2             = _mm_add_ps(fjx2,tx);
2218             fjy2             = _mm_add_ps(fjy2,ty);
2219             fjz2             = _mm_add_ps(fjz2,tz);
2220             
2221             /**************************
2222              * CALCULATE INTERACTIONS *
2223              **************************/
2224
2225             r20              = _mm_mul_ps(rsq20,rinv20);
2226             r20              = _mm_andnot_ps(dummy_mask,r20);
2227
2228             /* Calculate table index by multiplying r with table scale and truncate to integer */
2229             rt               = _mm_mul_ps(r20,vftabscale);
2230             vfitab           = _mm_cvttps_epi32(rt);
2231             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2232             vfitab           = _mm_slli_epi32(vfitab,2);
2233
2234             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2235             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2236             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2237             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2238             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2239             _MM_TRANSPOSE4_PS(Y,F,G,H);
2240             Heps             = _mm_mul_ps(vfeps,H);
2241             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2242             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2243             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2244
2245             fscal            = felec;
2246
2247             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2248
2249             /* Calculate temporary vectorial force */
2250             tx               = _mm_mul_ps(fscal,dx20);
2251             ty               = _mm_mul_ps(fscal,dy20);
2252             tz               = _mm_mul_ps(fscal,dz20);
2253
2254             /* Update vectorial force */
2255             fix2             = _mm_add_ps(fix2,tx);
2256             fiy2             = _mm_add_ps(fiy2,ty);
2257             fiz2             = _mm_add_ps(fiz2,tz);
2258
2259             fjx0             = _mm_add_ps(fjx0,tx);
2260             fjy0             = _mm_add_ps(fjy0,ty);
2261             fjz0             = _mm_add_ps(fjz0,tz);
2262             
2263             /**************************
2264              * CALCULATE INTERACTIONS *
2265              **************************/
2266
2267             r21              = _mm_mul_ps(rsq21,rinv21);
2268             r21              = _mm_andnot_ps(dummy_mask,r21);
2269
2270             /* Calculate table index by multiplying r with table scale and truncate to integer */
2271             rt               = _mm_mul_ps(r21,vftabscale);
2272             vfitab           = _mm_cvttps_epi32(rt);
2273             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2274             vfitab           = _mm_slli_epi32(vfitab,2);
2275
2276             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2277             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2278             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2279             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2280             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2281             _MM_TRANSPOSE4_PS(Y,F,G,H);
2282             Heps             = _mm_mul_ps(vfeps,H);
2283             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2284             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2285             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2286
2287             fscal            = felec;
2288
2289             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2290
2291             /* Calculate temporary vectorial force */
2292             tx               = _mm_mul_ps(fscal,dx21);
2293             ty               = _mm_mul_ps(fscal,dy21);
2294             tz               = _mm_mul_ps(fscal,dz21);
2295
2296             /* Update vectorial force */
2297             fix2             = _mm_add_ps(fix2,tx);
2298             fiy2             = _mm_add_ps(fiy2,ty);
2299             fiz2             = _mm_add_ps(fiz2,tz);
2300
2301             fjx1             = _mm_add_ps(fjx1,tx);
2302             fjy1             = _mm_add_ps(fjy1,ty);
2303             fjz1             = _mm_add_ps(fjz1,tz);
2304             
2305             /**************************
2306              * CALCULATE INTERACTIONS *
2307              **************************/
2308
2309             r22              = _mm_mul_ps(rsq22,rinv22);
2310             r22              = _mm_andnot_ps(dummy_mask,r22);
2311
2312             /* Calculate table index by multiplying r with table scale and truncate to integer */
2313             rt               = _mm_mul_ps(r22,vftabscale);
2314             vfitab           = _mm_cvttps_epi32(rt);
2315             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2316             vfitab           = _mm_slli_epi32(vfitab,2);
2317
2318             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2319             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2320             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2321             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2322             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2323             _MM_TRANSPOSE4_PS(Y,F,G,H);
2324             Heps             = _mm_mul_ps(vfeps,H);
2325             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2326             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2327             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2328
2329             fscal            = felec;
2330
2331             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2332
2333             /* Calculate temporary vectorial force */
2334             tx               = _mm_mul_ps(fscal,dx22);
2335             ty               = _mm_mul_ps(fscal,dy22);
2336             tz               = _mm_mul_ps(fscal,dz22);
2337
2338             /* Update vectorial force */
2339             fix2             = _mm_add_ps(fix2,tx);
2340             fiy2             = _mm_add_ps(fiy2,ty);
2341             fiz2             = _mm_add_ps(fiz2,tz);
2342
2343             fjx2             = _mm_add_ps(fjx2,tx);
2344             fjy2             = _mm_add_ps(fjy2,ty);
2345             fjz2             = _mm_add_ps(fjz2,tz);
2346             
2347             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2348             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2349             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2350             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2351
2352             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2353                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2354
2355             /* Inner loop uses 368 flops */
2356         }
2357
2358         /* End of innermost loop */
2359
2360         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2361                                               f+i_coord_offset,fshift+i_shift_offset);
2362
2363         /* Increment number of inner iterations */
2364         inneriter                  += j_index_end - j_index_start;
2365
2366         /* Outer loop uses 18 flops */
2367     }
2368
2369     /* Increment number of outer iterations */
2370     outeriter        += nri;
2371
2372     /* Update outer/inner flops */
2373
2374     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);
2375 }