Remove all unnecessary HAVE_CONFIG_H
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "nrnb.h"
46
47 #include "gromacs/simd/math_x86_sse2_single.h"
48 #include "kernelutil_x86_sse2_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
52  * Electrostatics interaction: CubicSplineTable
53  * VdW interaction:            CubicSplineTable
54  * Geometry:                   Water3-Water3
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_sse2_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
78     real             rcutoff_scalar;
79     real             *shiftvec,*fshift,*x,*f;
80     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
81     real             scratch[4*DIM];
82     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83     int              vdwioffset0;
84     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85     int              vdwioffset1;
86     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87     int              vdwioffset2;
88     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
90     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
91     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
92     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
93     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
94     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
95     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
97     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
98     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
99     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
100     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
101     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
102     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
103     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
104     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
105     real             *charge;
106     int              nvdwtype;
107     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108     int              *vdwtype;
109     real             *vdwparam;
110     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
111     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
112     __m128i          vfitab;
113     __m128i          ifour       = _mm_set1_epi32(4);
114     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
115     real             *vftab;
116     __m128           dummy_mask,cutoff_mask;
117     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
118     __m128           one     = _mm_set1_ps(1.0);
119     __m128           two     = _mm_set1_ps(2.0);
120     x                = xx[0];
121     f                = ff[0];
122
123     nri              = nlist->nri;
124     iinr             = nlist->iinr;
125     jindex           = nlist->jindex;
126     jjnr             = nlist->jjnr;
127     shiftidx         = nlist->shift;
128     gid              = nlist->gid;
129     shiftvec         = fr->shift_vec[0];
130     fshift           = fr->fshift[0];
131     facel            = _mm_set1_ps(fr->epsfac);
132     charge           = mdatoms->chargeA;
133     nvdwtype         = fr->ntype;
134     vdwparam         = fr->nbfp;
135     vdwtype          = mdatoms->typeA;
136
137     vftab            = kernel_data->table_elec_vdw->data;
138     vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
139
140     /* Setup water-specific parameters */
141     inr              = nlist->iinr[0];
142     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
143     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
144     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
145     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
146
147     jq0              = _mm_set1_ps(charge[inr+0]);
148     jq1              = _mm_set1_ps(charge[inr+1]);
149     jq2              = _mm_set1_ps(charge[inr+2]);
150     vdwjidx0A        = 2*vdwtype[inr+0];
151     qq00             = _mm_mul_ps(iq0,jq0);
152     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
153     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
154     qq01             = _mm_mul_ps(iq0,jq1);
155     qq02             = _mm_mul_ps(iq0,jq2);
156     qq10             = _mm_mul_ps(iq1,jq0);
157     qq11             = _mm_mul_ps(iq1,jq1);
158     qq12             = _mm_mul_ps(iq1,jq2);
159     qq20             = _mm_mul_ps(iq2,jq0);
160     qq21             = _mm_mul_ps(iq2,jq1);
161     qq22             = _mm_mul_ps(iq2,jq2);
162
163     /* Avoid stupid compiler warnings */
164     jnrA = jnrB = jnrC = jnrD = 0;
165     j_coord_offsetA = 0;
166     j_coord_offsetB = 0;
167     j_coord_offsetC = 0;
168     j_coord_offsetD = 0;
169
170     outeriter        = 0;
171     inneriter        = 0;
172
173     for(iidx=0;iidx<4*DIM;iidx++)
174     {
175         scratch[iidx] = 0.0;
176     }  
177
178     /* Start outer loop over neighborlists */
179     for(iidx=0; iidx<nri; iidx++)
180     {
181         /* Load shift vector for this list */
182         i_shift_offset   = DIM*shiftidx[iidx];
183
184         /* Load limits for loop over neighbors */
185         j_index_start    = jindex[iidx];
186         j_index_end      = jindex[iidx+1];
187
188         /* Get outer coordinate index */
189         inr              = iinr[iidx];
190         i_coord_offset   = DIM*inr;
191
192         /* Load i particle coords and add shift vector */
193         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
194                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
195         
196         fix0             = _mm_setzero_ps();
197         fiy0             = _mm_setzero_ps();
198         fiz0             = _mm_setzero_ps();
199         fix1             = _mm_setzero_ps();
200         fiy1             = _mm_setzero_ps();
201         fiz1             = _mm_setzero_ps();
202         fix2             = _mm_setzero_ps();
203         fiy2             = _mm_setzero_ps();
204         fiz2             = _mm_setzero_ps();
205
206         /* Reset potential sums */
207         velecsum         = _mm_setzero_ps();
208         vvdwsum          = _mm_setzero_ps();
209
210         /* Start inner kernel loop */
211         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
212         {
213
214             /* Get j neighbor index, and coordinate index */
215             jnrA             = jjnr[jidx];
216             jnrB             = jjnr[jidx+1];
217             jnrC             = jjnr[jidx+2];
218             jnrD             = jjnr[jidx+3];
219             j_coord_offsetA  = DIM*jnrA;
220             j_coord_offsetB  = DIM*jnrB;
221             j_coord_offsetC  = DIM*jnrC;
222             j_coord_offsetD  = DIM*jnrD;
223
224             /* load j atom coordinates */
225             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
226                                               x+j_coord_offsetC,x+j_coord_offsetD,
227                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
228
229             /* Calculate displacement vector */
230             dx00             = _mm_sub_ps(ix0,jx0);
231             dy00             = _mm_sub_ps(iy0,jy0);
232             dz00             = _mm_sub_ps(iz0,jz0);
233             dx01             = _mm_sub_ps(ix0,jx1);
234             dy01             = _mm_sub_ps(iy0,jy1);
235             dz01             = _mm_sub_ps(iz0,jz1);
236             dx02             = _mm_sub_ps(ix0,jx2);
237             dy02             = _mm_sub_ps(iy0,jy2);
238             dz02             = _mm_sub_ps(iz0,jz2);
239             dx10             = _mm_sub_ps(ix1,jx0);
240             dy10             = _mm_sub_ps(iy1,jy0);
241             dz10             = _mm_sub_ps(iz1,jz0);
242             dx11             = _mm_sub_ps(ix1,jx1);
243             dy11             = _mm_sub_ps(iy1,jy1);
244             dz11             = _mm_sub_ps(iz1,jz1);
245             dx12             = _mm_sub_ps(ix1,jx2);
246             dy12             = _mm_sub_ps(iy1,jy2);
247             dz12             = _mm_sub_ps(iz1,jz2);
248             dx20             = _mm_sub_ps(ix2,jx0);
249             dy20             = _mm_sub_ps(iy2,jy0);
250             dz20             = _mm_sub_ps(iz2,jz0);
251             dx21             = _mm_sub_ps(ix2,jx1);
252             dy21             = _mm_sub_ps(iy2,jy1);
253             dz21             = _mm_sub_ps(iz2,jz1);
254             dx22             = _mm_sub_ps(ix2,jx2);
255             dy22             = _mm_sub_ps(iy2,jy2);
256             dz22             = _mm_sub_ps(iz2,jz2);
257
258             /* Calculate squared distance and things based on it */
259             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
260             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
261             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
262             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
263             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
264             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
265             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
266             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
267             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
268
269             rinv00           = gmx_mm_invsqrt_ps(rsq00);
270             rinv01           = gmx_mm_invsqrt_ps(rsq01);
271             rinv02           = gmx_mm_invsqrt_ps(rsq02);
272             rinv10           = gmx_mm_invsqrt_ps(rsq10);
273             rinv11           = gmx_mm_invsqrt_ps(rsq11);
274             rinv12           = gmx_mm_invsqrt_ps(rsq12);
275             rinv20           = gmx_mm_invsqrt_ps(rsq20);
276             rinv21           = gmx_mm_invsqrt_ps(rsq21);
277             rinv22           = gmx_mm_invsqrt_ps(rsq22);
278
279             fjx0             = _mm_setzero_ps();
280             fjy0             = _mm_setzero_ps();
281             fjz0             = _mm_setzero_ps();
282             fjx1             = _mm_setzero_ps();
283             fjy1             = _mm_setzero_ps();
284             fjz1             = _mm_setzero_ps();
285             fjx2             = _mm_setzero_ps();
286             fjy2             = _mm_setzero_ps();
287             fjz2             = _mm_setzero_ps();
288
289             /**************************
290              * CALCULATE INTERACTIONS *
291              **************************/
292
293             r00              = _mm_mul_ps(rsq00,rinv00);
294
295             /* Calculate table index by multiplying r with table scale and truncate to integer */
296             rt               = _mm_mul_ps(r00,vftabscale);
297             vfitab           = _mm_cvttps_epi32(rt);
298             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
299             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
300
301             /* CUBIC SPLINE TABLE ELECTROSTATICS */
302             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
303             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
304             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
305             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
306             _MM_TRANSPOSE4_PS(Y,F,G,H);
307             Heps             = _mm_mul_ps(vfeps,H);
308             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
309             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
310             velec            = _mm_mul_ps(qq00,VV);
311             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
312             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
313
314             /* CUBIC SPLINE TABLE DISPERSION */
315             vfitab           = _mm_add_epi32(vfitab,ifour);
316             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
317             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
318             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
319             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
320             _MM_TRANSPOSE4_PS(Y,F,G,H);
321             Heps             = _mm_mul_ps(vfeps,H);
322             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
323             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
324             vvdw6            = _mm_mul_ps(c6_00,VV);
325             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
326             fvdw6            = _mm_mul_ps(c6_00,FF);
327
328             /* CUBIC SPLINE TABLE REPULSION */
329             vfitab           = _mm_add_epi32(vfitab,ifour);
330             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
331             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
332             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
333             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
334             _MM_TRANSPOSE4_PS(Y,F,G,H);
335             Heps             = _mm_mul_ps(vfeps,H);
336             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
337             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
338             vvdw12           = _mm_mul_ps(c12_00,VV);
339             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
340             fvdw12           = _mm_mul_ps(c12_00,FF);
341             vvdw             = _mm_add_ps(vvdw12,vvdw6);
342             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
343
344             /* Update potential sum for this i atom from the interaction with this j atom. */
345             velecsum         = _mm_add_ps(velecsum,velec);
346             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
347
348             fscal            = _mm_add_ps(felec,fvdw);
349
350             /* Calculate temporary vectorial force */
351             tx               = _mm_mul_ps(fscal,dx00);
352             ty               = _mm_mul_ps(fscal,dy00);
353             tz               = _mm_mul_ps(fscal,dz00);
354
355             /* Update vectorial force */
356             fix0             = _mm_add_ps(fix0,tx);
357             fiy0             = _mm_add_ps(fiy0,ty);
358             fiz0             = _mm_add_ps(fiz0,tz);
359
360             fjx0             = _mm_add_ps(fjx0,tx);
361             fjy0             = _mm_add_ps(fjy0,ty);
362             fjz0             = _mm_add_ps(fjz0,tz);
363             
364             /**************************
365              * CALCULATE INTERACTIONS *
366              **************************/
367
368             r01              = _mm_mul_ps(rsq01,rinv01);
369
370             /* Calculate table index by multiplying r with table scale and truncate to integer */
371             rt               = _mm_mul_ps(r01,vftabscale);
372             vfitab           = _mm_cvttps_epi32(rt);
373             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
374             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
375
376             /* CUBIC SPLINE TABLE ELECTROSTATICS */
377             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
378             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
379             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
380             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
381             _MM_TRANSPOSE4_PS(Y,F,G,H);
382             Heps             = _mm_mul_ps(vfeps,H);
383             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
384             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
385             velec            = _mm_mul_ps(qq01,VV);
386             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
387             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velecsum         = _mm_add_ps(velecsum,velec);
391
392             fscal            = felec;
393
394             /* Calculate temporary vectorial force */
395             tx               = _mm_mul_ps(fscal,dx01);
396             ty               = _mm_mul_ps(fscal,dy01);
397             tz               = _mm_mul_ps(fscal,dz01);
398
399             /* Update vectorial force */
400             fix0             = _mm_add_ps(fix0,tx);
401             fiy0             = _mm_add_ps(fiy0,ty);
402             fiz0             = _mm_add_ps(fiz0,tz);
403
404             fjx1             = _mm_add_ps(fjx1,tx);
405             fjy1             = _mm_add_ps(fjy1,ty);
406             fjz1             = _mm_add_ps(fjz1,tz);
407             
408             /**************************
409              * CALCULATE INTERACTIONS *
410              **************************/
411
412             r02              = _mm_mul_ps(rsq02,rinv02);
413
414             /* Calculate table index by multiplying r with table scale and truncate to integer */
415             rt               = _mm_mul_ps(r02,vftabscale);
416             vfitab           = _mm_cvttps_epi32(rt);
417             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
418             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
419
420             /* CUBIC SPLINE TABLE ELECTROSTATICS */
421             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
422             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
423             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
424             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
425             _MM_TRANSPOSE4_PS(Y,F,G,H);
426             Heps             = _mm_mul_ps(vfeps,H);
427             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
428             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
429             velec            = _mm_mul_ps(qq02,VV);
430             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
431             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
432
433             /* Update potential sum for this i atom from the interaction with this j atom. */
434             velecsum         = _mm_add_ps(velecsum,velec);
435
436             fscal            = felec;
437
438             /* Calculate temporary vectorial force */
439             tx               = _mm_mul_ps(fscal,dx02);
440             ty               = _mm_mul_ps(fscal,dy02);
441             tz               = _mm_mul_ps(fscal,dz02);
442
443             /* Update vectorial force */
444             fix0             = _mm_add_ps(fix0,tx);
445             fiy0             = _mm_add_ps(fiy0,ty);
446             fiz0             = _mm_add_ps(fiz0,tz);
447
448             fjx2             = _mm_add_ps(fjx2,tx);
449             fjy2             = _mm_add_ps(fjy2,ty);
450             fjz2             = _mm_add_ps(fjz2,tz);
451             
452             /**************************
453              * CALCULATE INTERACTIONS *
454              **************************/
455
456             r10              = _mm_mul_ps(rsq10,rinv10);
457
458             /* Calculate table index by multiplying r with table scale and truncate to integer */
459             rt               = _mm_mul_ps(r10,vftabscale);
460             vfitab           = _mm_cvttps_epi32(rt);
461             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
462             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
463
464             /* CUBIC SPLINE TABLE ELECTROSTATICS */
465             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
466             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
467             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
468             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
469             _MM_TRANSPOSE4_PS(Y,F,G,H);
470             Heps             = _mm_mul_ps(vfeps,H);
471             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
472             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
473             velec            = _mm_mul_ps(qq10,VV);
474             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
475             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
476
477             /* Update potential sum for this i atom from the interaction with this j atom. */
478             velecsum         = _mm_add_ps(velecsum,velec);
479
480             fscal            = felec;
481
482             /* Calculate temporary vectorial force */
483             tx               = _mm_mul_ps(fscal,dx10);
484             ty               = _mm_mul_ps(fscal,dy10);
485             tz               = _mm_mul_ps(fscal,dz10);
486
487             /* Update vectorial force */
488             fix1             = _mm_add_ps(fix1,tx);
489             fiy1             = _mm_add_ps(fiy1,ty);
490             fiz1             = _mm_add_ps(fiz1,tz);
491
492             fjx0             = _mm_add_ps(fjx0,tx);
493             fjy0             = _mm_add_ps(fjy0,ty);
494             fjz0             = _mm_add_ps(fjz0,tz);
495             
496             /**************************
497              * CALCULATE INTERACTIONS *
498              **************************/
499
500             r11              = _mm_mul_ps(rsq11,rinv11);
501
502             /* Calculate table index by multiplying r with table scale and truncate to integer */
503             rt               = _mm_mul_ps(r11,vftabscale);
504             vfitab           = _mm_cvttps_epi32(rt);
505             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
506             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
507
508             /* CUBIC SPLINE TABLE ELECTROSTATICS */
509             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
510             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
511             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
512             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
513             _MM_TRANSPOSE4_PS(Y,F,G,H);
514             Heps             = _mm_mul_ps(vfeps,H);
515             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
516             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
517             velec            = _mm_mul_ps(qq11,VV);
518             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
519             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
520
521             /* Update potential sum for this i atom from the interaction with this j atom. */
522             velecsum         = _mm_add_ps(velecsum,velec);
523
524             fscal            = felec;
525
526             /* Calculate temporary vectorial force */
527             tx               = _mm_mul_ps(fscal,dx11);
528             ty               = _mm_mul_ps(fscal,dy11);
529             tz               = _mm_mul_ps(fscal,dz11);
530
531             /* Update vectorial force */
532             fix1             = _mm_add_ps(fix1,tx);
533             fiy1             = _mm_add_ps(fiy1,ty);
534             fiz1             = _mm_add_ps(fiz1,tz);
535
536             fjx1             = _mm_add_ps(fjx1,tx);
537             fjy1             = _mm_add_ps(fjy1,ty);
538             fjz1             = _mm_add_ps(fjz1,tz);
539             
540             /**************************
541              * CALCULATE INTERACTIONS *
542              **************************/
543
544             r12              = _mm_mul_ps(rsq12,rinv12);
545
546             /* Calculate table index by multiplying r with table scale and truncate to integer */
547             rt               = _mm_mul_ps(r12,vftabscale);
548             vfitab           = _mm_cvttps_epi32(rt);
549             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
550             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
551
552             /* CUBIC SPLINE TABLE ELECTROSTATICS */
553             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
554             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
555             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
556             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
557             _MM_TRANSPOSE4_PS(Y,F,G,H);
558             Heps             = _mm_mul_ps(vfeps,H);
559             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
560             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
561             velec            = _mm_mul_ps(qq12,VV);
562             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
563             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
564
565             /* Update potential sum for this i atom from the interaction with this j atom. */
566             velecsum         = _mm_add_ps(velecsum,velec);
567
568             fscal            = felec;
569
570             /* Calculate temporary vectorial force */
571             tx               = _mm_mul_ps(fscal,dx12);
572             ty               = _mm_mul_ps(fscal,dy12);
573             tz               = _mm_mul_ps(fscal,dz12);
574
575             /* Update vectorial force */
576             fix1             = _mm_add_ps(fix1,tx);
577             fiy1             = _mm_add_ps(fiy1,ty);
578             fiz1             = _mm_add_ps(fiz1,tz);
579
580             fjx2             = _mm_add_ps(fjx2,tx);
581             fjy2             = _mm_add_ps(fjy2,ty);
582             fjz2             = _mm_add_ps(fjz2,tz);
583             
584             /**************************
585              * CALCULATE INTERACTIONS *
586              **************************/
587
588             r20              = _mm_mul_ps(rsq20,rinv20);
589
590             /* Calculate table index by multiplying r with table scale and truncate to integer */
591             rt               = _mm_mul_ps(r20,vftabscale);
592             vfitab           = _mm_cvttps_epi32(rt);
593             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
594             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
595
596             /* CUBIC SPLINE TABLE ELECTROSTATICS */
597             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
598             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
599             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
600             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
601             _MM_TRANSPOSE4_PS(Y,F,G,H);
602             Heps             = _mm_mul_ps(vfeps,H);
603             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
604             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
605             velec            = _mm_mul_ps(qq20,VV);
606             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
607             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
608
609             /* Update potential sum for this i atom from the interaction with this j atom. */
610             velecsum         = _mm_add_ps(velecsum,velec);
611
612             fscal            = felec;
613
614             /* Calculate temporary vectorial force */
615             tx               = _mm_mul_ps(fscal,dx20);
616             ty               = _mm_mul_ps(fscal,dy20);
617             tz               = _mm_mul_ps(fscal,dz20);
618
619             /* Update vectorial force */
620             fix2             = _mm_add_ps(fix2,tx);
621             fiy2             = _mm_add_ps(fiy2,ty);
622             fiz2             = _mm_add_ps(fiz2,tz);
623
624             fjx0             = _mm_add_ps(fjx0,tx);
625             fjy0             = _mm_add_ps(fjy0,ty);
626             fjz0             = _mm_add_ps(fjz0,tz);
627             
628             /**************************
629              * CALCULATE INTERACTIONS *
630              **************************/
631
632             r21              = _mm_mul_ps(rsq21,rinv21);
633
634             /* Calculate table index by multiplying r with table scale and truncate to integer */
635             rt               = _mm_mul_ps(r21,vftabscale);
636             vfitab           = _mm_cvttps_epi32(rt);
637             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
638             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
639
640             /* CUBIC SPLINE TABLE ELECTROSTATICS */
641             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
642             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
643             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
644             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
645             _MM_TRANSPOSE4_PS(Y,F,G,H);
646             Heps             = _mm_mul_ps(vfeps,H);
647             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
648             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
649             velec            = _mm_mul_ps(qq21,VV);
650             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
651             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
652
653             /* Update potential sum for this i atom from the interaction with this j atom. */
654             velecsum         = _mm_add_ps(velecsum,velec);
655
656             fscal            = felec;
657
658             /* Calculate temporary vectorial force */
659             tx               = _mm_mul_ps(fscal,dx21);
660             ty               = _mm_mul_ps(fscal,dy21);
661             tz               = _mm_mul_ps(fscal,dz21);
662
663             /* Update vectorial force */
664             fix2             = _mm_add_ps(fix2,tx);
665             fiy2             = _mm_add_ps(fiy2,ty);
666             fiz2             = _mm_add_ps(fiz2,tz);
667
668             fjx1             = _mm_add_ps(fjx1,tx);
669             fjy1             = _mm_add_ps(fjy1,ty);
670             fjz1             = _mm_add_ps(fjz1,tz);
671             
672             /**************************
673              * CALCULATE INTERACTIONS *
674              **************************/
675
676             r22              = _mm_mul_ps(rsq22,rinv22);
677
678             /* Calculate table index by multiplying r with table scale and truncate to integer */
679             rt               = _mm_mul_ps(r22,vftabscale);
680             vfitab           = _mm_cvttps_epi32(rt);
681             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
682             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
683
684             /* CUBIC SPLINE TABLE ELECTROSTATICS */
685             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
686             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
687             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
688             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
689             _MM_TRANSPOSE4_PS(Y,F,G,H);
690             Heps             = _mm_mul_ps(vfeps,H);
691             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
692             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
693             velec            = _mm_mul_ps(qq22,VV);
694             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
695             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
696
697             /* Update potential sum for this i atom from the interaction with this j atom. */
698             velecsum         = _mm_add_ps(velecsum,velec);
699
700             fscal            = felec;
701
702             /* Calculate temporary vectorial force */
703             tx               = _mm_mul_ps(fscal,dx22);
704             ty               = _mm_mul_ps(fscal,dy22);
705             tz               = _mm_mul_ps(fscal,dz22);
706
707             /* Update vectorial force */
708             fix2             = _mm_add_ps(fix2,tx);
709             fiy2             = _mm_add_ps(fiy2,ty);
710             fiz2             = _mm_add_ps(fiz2,tz);
711
712             fjx2             = _mm_add_ps(fjx2,tx);
713             fjy2             = _mm_add_ps(fjy2,ty);
714             fjz2             = _mm_add_ps(fjz2,tz);
715             
716             fjptrA             = f+j_coord_offsetA;
717             fjptrB             = f+j_coord_offsetB;
718             fjptrC             = f+j_coord_offsetC;
719             fjptrD             = f+j_coord_offsetD;
720
721             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
722                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
723
724             /* Inner loop uses 417 flops */
725         }
726
727         if(jidx<j_index_end)
728         {
729
730             /* Get j neighbor index, and coordinate index */
731             jnrlistA         = jjnr[jidx];
732             jnrlistB         = jjnr[jidx+1];
733             jnrlistC         = jjnr[jidx+2];
734             jnrlistD         = jjnr[jidx+3];
735             /* Sign of each element will be negative for non-real atoms.
736              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
737              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
738              */
739             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
740             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
741             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
742             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
743             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
744             j_coord_offsetA  = DIM*jnrA;
745             j_coord_offsetB  = DIM*jnrB;
746             j_coord_offsetC  = DIM*jnrC;
747             j_coord_offsetD  = DIM*jnrD;
748
749             /* load j atom coordinates */
750             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
751                                               x+j_coord_offsetC,x+j_coord_offsetD,
752                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
753
754             /* Calculate displacement vector */
755             dx00             = _mm_sub_ps(ix0,jx0);
756             dy00             = _mm_sub_ps(iy0,jy0);
757             dz00             = _mm_sub_ps(iz0,jz0);
758             dx01             = _mm_sub_ps(ix0,jx1);
759             dy01             = _mm_sub_ps(iy0,jy1);
760             dz01             = _mm_sub_ps(iz0,jz1);
761             dx02             = _mm_sub_ps(ix0,jx2);
762             dy02             = _mm_sub_ps(iy0,jy2);
763             dz02             = _mm_sub_ps(iz0,jz2);
764             dx10             = _mm_sub_ps(ix1,jx0);
765             dy10             = _mm_sub_ps(iy1,jy0);
766             dz10             = _mm_sub_ps(iz1,jz0);
767             dx11             = _mm_sub_ps(ix1,jx1);
768             dy11             = _mm_sub_ps(iy1,jy1);
769             dz11             = _mm_sub_ps(iz1,jz1);
770             dx12             = _mm_sub_ps(ix1,jx2);
771             dy12             = _mm_sub_ps(iy1,jy2);
772             dz12             = _mm_sub_ps(iz1,jz2);
773             dx20             = _mm_sub_ps(ix2,jx0);
774             dy20             = _mm_sub_ps(iy2,jy0);
775             dz20             = _mm_sub_ps(iz2,jz0);
776             dx21             = _mm_sub_ps(ix2,jx1);
777             dy21             = _mm_sub_ps(iy2,jy1);
778             dz21             = _mm_sub_ps(iz2,jz1);
779             dx22             = _mm_sub_ps(ix2,jx2);
780             dy22             = _mm_sub_ps(iy2,jy2);
781             dz22             = _mm_sub_ps(iz2,jz2);
782
783             /* Calculate squared distance and things based on it */
784             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
785             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
786             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
787             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
788             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
789             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
790             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
791             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
792             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
793
794             rinv00           = gmx_mm_invsqrt_ps(rsq00);
795             rinv01           = gmx_mm_invsqrt_ps(rsq01);
796             rinv02           = gmx_mm_invsqrt_ps(rsq02);
797             rinv10           = gmx_mm_invsqrt_ps(rsq10);
798             rinv11           = gmx_mm_invsqrt_ps(rsq11);
799             rinv12           = gmx_mm_invsqrt_ps(rsq12);
800             rinv20           = gmx_mm_invsqrt_ps(rsq20);
801             rinv21           = gmx_mm_invsqrt_ps(rsq21);
802             rinv22           = gmx_mm_invsqrt_ps(rsq22);
803
804             fjx0             = _mm_setzero_ps();
805             fjy0             = _mm_setzero_ps();
806             fjz0             = _mm_setzero_ps();
807             fjx1             = _mm_setzero_ps();
808             fjy1             = _mm_setzero_ps();
809             fjz1             = _mm_setzero_ps();
810             fjx2             = _mm_setzero_ps();
811             fjy2             = _mm_setzero_ps();
812             fjz2             = _mm_setzero_ps();
813
814             /**************************
815              * CALCULATE INTERACTIONS *
816              **************************/
817
818             r00              = _mm_mul_ps(rsq00,rinv00);
819             r00              = _mm_andnot_ps(dummy_mask,r00);
820
821             /* Calculate table index by multiplying r with table scale and truncate to integer */
822             rt               = _mm_mul_ps(r00,vftabscale);
823             vfitab           = _mm_cvttps_epi32(rt);
824             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
825             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
826
827             /* CUBIC SPLINE TABLE ELECTROSTATICS */
828             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
829             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
830             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
831             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
832             _MM_TRANSPOSE4_PS(Y,F,G,H);
833             Heps             = _mm_mul_ps(vfeps,H);
834             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
835             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
836             velec            = _mm_mul_ps(qq00,VV);
837             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
838             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
839
840             /* CUBIC SPLINE TABLE DISPERSION */
841             vfitab           = _mm_add_epi32(vfitab,ifour);
842             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
843             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
844             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
845             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
846             _MM_TRANSPOSE4_PS(Y,F,G,H);
847             Heps             = _mm_mul_ps(vfeps,H);
848             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
849             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
850             vvdw6            = _mm_mul_ps(c6_00,VV);
851             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
852             fvdw6            = _mm_mul_ps(c6_00,FF);
853
854             /* CUBIC SPLINE TABLE REPULSION */
855             vfitab           = _mm_add_epi32(vfitab,ifour);
856             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
857             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
858             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
859             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
860             _MM_TRANSPOSE4_PS(Y,F,G,H);
861             Heps             = _mm_mul_ps(vfeps,H);
862             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
863             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
864             vvdw12           = _mm_mul_ps(c12_00,VV);
865             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
866             fvdw12           = _mm_mul_ps(c12_00,FF);
867             vvdw             = _mm_add_ps(vvdw12,vvdw6);
868             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
869
870             /* Update potential sum for this i atom from the interaction with this j atom. */
871             velec            = _mm_andnot_ps(dummy_mask,velec);
872             velecsum         = _mm_add_ps(velecsum,velec);
873             vvdw             = _mm_andnot_ps(dummy_mask,vvdw);
874             vvdwsum          = _mm_add_ps(vvdwsum,vvdw);
875
876             fscal            = _mm_add_ps(felec,fvdw);
877
878             fscal            = _mm_andnot_ps(dummy_mask,fscal);
879
880             /* Calculate temporary vectorial force */
881             tx               = _mm_mul_ps(fscal,dx00);
882             ty               = _mm_mul_ps(fscal,dy00);
883             tz               = _mm_mul_ps(fscal,dz00);
884
885             /* Update vectorial force */
886             fix0             = _mm_add_ps(fix0,tx);
887             fiy0             = _mm_add_ps(fiy0,ty);
888             fiz0             = _mm_add_ps(fiz0,tz);
889
890             fjx0             = _mm_add_ps(fjx0,tx);
891             fjy0             = _mm_add_ps(fjy0,ty);
892             fjz0             = _mm_add_ps(fjz0,tz);
893             
894             /**************************
895              * CALCULATE INTERACTIONS *
896              **************************/
897
898             r01              = _mm_mul_ps(rsq01,rinv01);
899             r01              = _mm_andnot_ps(dummy_mask,r01);
900
901             /* Calculate table index by multiplying r with table scale and truncate to integer */
902             rt               = _mm_mul_ps(r01,vftabscale);
903             vfitab           = _mm_cvttps_epi32(rt);
904             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
905             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
906
907             /* CUBIC SPLINE TABLE ELECTROSTATICS */
908             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
909             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
910             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
911             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
912             _MM_TRANSPOSE4_PS(Y,F,G,H);
913             Heps             = _mm_mul_ps(vfeps,H);
914             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
915             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
916             velec            = _mm_mul_ps(qq01,VV);
917             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
918             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
919
920             /* Update potential sum for this i atom from the interaction with this j atom. */
921             velec            = _mm_andnot_ps(dummy_mask,velec);
922             velecsum         = _mm_add_ps(velecsum,velec);
923
924             fscal            = felec;
925
926             fscal            = _mm_andnot_ps(dummy_mask,fscal);
927
928             /* Calculate temporary vectorial force */
929             tx               = _mm_mul_ps(fscal,dx01);
930             ty               = _mm_mul_ps(fscal,dy01);
931             tz               = _mm_mul_ps(fscal,dz01);
932
933             /* Update vectorial force */
934             fix0             = _mm_add_ps(fix0,tx);
935             fiy0             = _mm_add_ps(fiy0,ty);
936             fiz0             = _mm_add_ps(fiz0,tz);
937
938             fjx1             = _mm_add_ps(fjx1,tx);
939             fjy1             = _mm_add_ps(fjy1,ty);
940             fjz1             = _mm_add_ps(fjz1,tz);
941             
942             /**************************
943              * CALCULATE INTERACTIONS *
944              **************************/
945
946             r02              = _mm_mul_ps(rsq02,rinv02);
947             r02              = _mm_andnot_ps(dummy_mask,r02);
948
949             /* Calculate table index by multiplying r with table scale and truncate to integer */
950             rt               = _mm_mul_ps(r02,vftabscale);
951             vfitab           = _mm_cvttps_epi32(rt);
952             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
953             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
954
955             /* CUBIC SPLINE TABLE ELECTROSTATICS */
956             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
957             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
958             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
959             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
960             _MM_TRANSPOSE4_PS(Y,F,G,H);
961             Heps             = _mm_mul_ps(vfeps,H);
962             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
963             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
964             velec            = _mm_mul_ps(qq02,VV);
965             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
966             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
967
968             /* Update potential sum for this i atom from the interaction with this j atom. */
969             velec            = _mm_andnot_ps(dummy_mask,velec);
970             velecsum         = _mm_add_ps(velecsum,velec);
971
972             fscal            = felec;
973
974             fscal            = _mm_andnot_ps(dummy_mask,fscal);
975
976             /* Calculate temporary vectorial force */
977             tx               = _mm_mul_ps(fscal,dx02);
978             ty               = _mm_mul_ps(fscal,dy02);
979             tz               = _mm_mul_ps(fscal,dz02);
980
981             /* Update vectorial force */
982             fix0             = _mm_add_ps(fix0,tx);
983             fiy0             = _mm_add_ps(fiy0,ty);
984             fiz0             = _mm_add_ps(fiz0,tz);
985
986             fjx2             = _mm_add_ps(fjx2,tx);
987             fjy2             = _mm_add_ps(fjy2,ty);
988             fjz2             = _mm_add_ps(fjz2,tz);
989             
990             /**************************
991              * CALCULATE INTERACTIONS *
992              **************************/
993
994             r10              = _mm_mul_ps(rsq10,rinv10);
995             r10              = _mm_andnot_ps(dummy_mask,r10);
996
997             /* Calculate table index by multiplying r with table scale and truncate to integer */
998             rt               = _mm_mul_ps(r10,vftabscale);
999             vfitab           = _mm_cvttps_epi32(rt);
1000             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1001             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1002
1003             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1004             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1005             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1006             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1007             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1008             _MM_TRANSPOSE4_PS(Y,F,G,H);
1009             Heps             = _mm_mul_ps(vfeps,H);
1010             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1011             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1012             velec            = _mm_mul_ps(qq10,VV);
1013             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1014             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1015
1016             /* Update potential sum for this i atom from the interaction with this j atom. */
1017             velec            = _mm_andnot_ps(dummy_mask,velec);
1018             velecsum         = _mm_add_ps(velecsum,velec);
1019
1020             fscal            = felec;
1021
1022             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1023
1024             /* Calculate temporary vectorial force */
1025             tx               = _mm_mul_ps(fscal,dx10);
1026             ty               = _mm_mul_ps(fscal,dy10);
1027             tz               = _mm_mul_ps(fscal,dz10);
1028
1029             /* Update vectorial force */
1030             fix1             = _mm_add_ps(fix1,tx);
1031             fiy1             = _mm_add_ps(fiy1,ty);
1032             fiz1             = _mm_add_ps(fiz1,tz);
1033
1034             fjx0             = _mm_add_ps(fjx0,tx);
1035             fjy0             = _mm_add_ps(fjy0,ty);
1036             fjz0             = _mm_add_ps(fjz0,tz);
1037             
1038             /**************************
1039              * CALCULATE INTERACTIONS *
1040              **************************/
1041
1042             r11              = _mm_mul_ps(rsq11,rinv11);
1043             r11              = _mm_andnot_ps(dummy_mask,r11);
1044
1045             /* Calculate table index by multiplying r with table scale and truncate to integer */
1046             rt               = _mm_mul_ps(r11,vftabscale);
1047             vfitab           = _mm_cvttps_epi32(rt);
1048             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1049             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1050
1051             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1052             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1053             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1054             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1055             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1056             _MM_TRANSPOSE4_PS(Y,F,G,H);
1057             Heps             = _mm_mul_ps(vfeps,H);
1058             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1059             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1060             velec            = _mm_mul_ps(qq11,VV);
1061             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1062             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1063
1064             /* Update potential sum for this i atom from the interaction with this j atom. */
1065             velec            = _mm_andnot_ps(dummy_mask,velec);
1066             velecsum         = _mm_add_ps(velecsum,velec);
1067
1068             fscal            = felec;
1069
1070             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1071
1072             /* Calculate temporary vectorial force */
1073             tx               = _mm_mul_ps(fscal,dx11);
1074             ty               = _mm_mul_ps(fscal,dy11);
1075             tz               = _mm_mul_ps(fscal,dz11);
1076
1077             /* Update vectorial force */
1078             fix1             = _mm_add_ps(fix1,tx);
1079             fiy1             = _mm_add_ps(fiy1,ty);
1080             fiz1             = _mm_add_ps(fiz1,tz);
1081
1082             fjx1             = _mm_add_ps(fjx1,tx);
1083             fjy1             = _mm_add_ps(fjy1,ty);
1084             fjz1             = _mm_add_ps(fjz1,tz);
1085             
1086             /**************************
1087              * CALCULATE INTERACTIONS *
1088              **************************/
1089
1090             r12              = _mm_mul_ps(rsq12,rinv12);
1091             r12              = _mm_andnot_ps(dummy_mask,r12);
1092
1093             /* Calculate table index by multiplying r with table scale and truncate to integer */
1094             rt               = _mm_mul_ps(r12,vftabscale);
1095             vfitab           = _mm_cvttps_epi32(rt);
1096             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1097             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1098
1099             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1100             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1101             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1102             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1103             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1104             _MM_TRANSPOSE4_PS(Y,F,G,H);
1105             Heps             = _mm_mul_ps(vfeps,H);
1106             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1107             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1108             velec            = _mm_mul_ps(qq12,VV);
1109             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1110             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1111
1112             /* Update potential sum for this i atom from the interaction with this j atom. */
1113             velec            = _mm_andnot_ps(dummy_mask,velec);
1114             velecsum         = _mm_add_ps(velecsum,velec);
1115
1116             fscal            = felec;
1117
1118             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1119
1120             /* Calculate temporary vectorial force */
1121             tx               = _mm_mul_ps(fscal,dx12);
1122             ty               = _mm_mul_ps(fscal,dy12);
1123             tz               = _mm_mul_ps(fscal,dz12);
1124
1125             /* Update vectorial force */
1126             fix1             = _mm_add_ps(fix1,tx);
1127             fiy1             = _mm_add_ps(fiy1,ty);
1128             fiz1             = _mm_add_ps(fiz1,tz);
1129
1130             fjx2             = _mm_add_ps(fjx2,tx);
1131             fjy2             = _mm_add_ps(fjy2,ty);
1132             fjz2             = _mm_add_ps(fjz2,tz);
1133             
1134             /**************************
1135              * CALCULATE INTERACTIONS *
1136              **************************/
1137
1138             r20              = _mm_mul_ps(rsq20,rinv20);
1139             r20              = _mm_andnot_ps(dummy_mask,r20);
1140
1141             /* Calculate table index by multiplying r with table scale and truncate to integer */
1142             rt               = _mm_mul_ps(r20,vftabscale);
1143             vfitab           = _mm_cvttps_epi32(rt);
1144             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1145             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1146
1147             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1149             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1150             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1151             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1152             _MM_TRANSPOSE4_PS(Y,F,G,H);
1153             Heps             = _mm_mul_ps(vfeps,H);
1154             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1155             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1156             velec            = _mm_mul_ps(qq20,VV);
1157             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1158             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1159
1160             /* Update potential sum for this i atom from the interaction with this j atom. */
1161             velec            = _mm_andnot_ps(dummy_mask,velec);
1162             velecsum         = _mm_add_ps(velecsum,velec);
1163
1164             fscal            = felec;
1165
1166             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1167
1168             /* Calculate temporary vectorial force */
1169             tx               = _mm_mul_ps(fscal,dx20);
1170             ty               = _mm_mul_ps(fscal,dy20);
1171             tz               = _mm_mul_ps(fscal,dz20);
1172
1173             /* Update vectorial force */
1174             fix2             = _mm_add_ps(fix2,tx);
1175             fiy2             = _mm_add_ps(fiy2,ty);
1176             fiz2             = _mm_add_ps(fiz2,tz);
1177
1178             fjx0             = _mm_add_ps(fjx0,tx);
1179             fjy0             = _mm_add_ps(fjy0,ty);
1180             fjz0             = _mm_add_ps(fjz0,tz);
1181             
1182             /**************************
1183              * CALCULATE INTERACTIONS *
1184              **************************/
1185
1186             r21              = _mm_mul_ps(rsq21,rinv21);
1187             r21              = _mm_andnot_ps(dummy_mask,r21);
1188
1189             /* Calculate table index by multiplying r with table scale and truncate to integer */
1190             rt               = _mm_mul_ps(r21,vftabscale);
1191             vfitab           = _mm_cvttps_epi32(rt);
1192             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1193             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1194
1195             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1196             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1197             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1198             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1199             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1200             _MM_TRANSPOSE4_PS(Y,F,G,H);
1201             Heps             = _mm_mul_ps(vfeps,H);
1202             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1203             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1204             velec            = _mm_mul_ps(qq21,VV);
1205             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1206             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1207
1208             /* Update potential sum for this i atom from the interaction with this j atom. */
1209             velec            = _mm_andnot_ps(dummy_mask,velec);
1210             velecsum         = _mm_add_ps(velecsum,velec);
1211
1212             fscal            = felec;
1213
1214             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1215
1216             /* Calculate temporary vectorial force */
1217             tx               = _mm_mul_ps(fscal,dx21);
1218             ty               = _mm_mul_ps(fscal,dy21);
1219             tz               = _mm_mul_ps(fscal,dz21);
1220
1221             /* Update vectorial force */
1222             fix2             = _mm_add_ps(fix2,tx);
1223             fiy2             = _mm_add_ps(fiy2,ty);
1224             fiz2             = _mm_add_ps(fiz2,tz);
1225
1226             fjx1             = _mm_add_ps(fjx1,tx);
1227             fjy1             = _mm_add_ps(fjy1,ty);
1228             fjz1             = _mm_add_ps(fjz1,tz);
1229             
1230             /**************************
1231              * CALCULATE INTERACTIONS *
1232              **************************/
1233
1234             r22              = _mm_mul_ps(rsq22,rinv22);
1235             r22              = _mm_andnot_ps(dummy_mask,r22);
1236
1237             /* Calculate table index by multiplying r with table scale and truncate to integer */
1238             rt               = _mm_mul_ps(r22,vftabscale);
1239             vfitab           = _mm_cvttps_epi32(rt);
1240             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1241             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1242
1243             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1244             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1245             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1246             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1247             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1248             _MM_TRANSPOSE4_PS(Y,F,G,H);
1249             Heps             = _mm_mul_ps(vfeps,H);
1250             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1251             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1252             velec            = _mm_mul_ps(qq22,VV);
1253             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1254             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1255
1256             /* Update potential sum for this i atom from the interaction with this j atom. */
1257             velec            = _mm_andnot_ps(dummy_mask,velec);
1258             velecsum         = _mm_add_ps(velecsum,velec);
1259
1260             fscal            = felec;
1261
1262             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1263
1264             /* Calculate temporary vectorial force */
1265             tx               = _mm_mul_ps(fscal,dx22);
1266             ty               = _mm_mul_ps(fscal,dy22);
1267             tz               = _mm_mul_ps(fscal,dz22);
1268
1269             /* Update vectorial force */
1270             fix2             = _mm_add_ps(fix2,tx);
1271             fiy2             = _mm_add_ps(fiy2,ty);
1272             fiz2             = _mm_add_ps(fiz2,tz);
1273
1274             fjx2             = _mm_add_ps(fjx2,tx);
1275             fjy2             = _mm_add_ps(fjy2,ty);
1276             fjz2             = _mm_add_ps(fjz2,tz);
1277             
1278             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1279             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1280             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1281             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1282
1283             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1284                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1285
1286             /* Inner loop uses 426 flops */
1287         }
1288
1289         /* End of innermost loop */
1290
1291         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1292                                               f+i_coord_offset,fshift+i_shift_offset);
1293
1294         ggid                        = gid[iidx];
1295         /* Update potential energies */
1296         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1297         gmx_mm_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1298
1299         /* Increment number of inner iterations */
1300         inneriter                  += j_index_end - j_index_start;
1301
1302         /* Outer loop uses 20 flops */
1303     }
1304
1305     /* Increment number of outer iterations */
1306     outeriter        += nri;
1307
1308     /* Update outer/inner flops */
1309
1310     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1311 }
1312 /*
1313  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1314  * Electrostatics interaction: CubicSplineTable
1315  * VdW interaction:            CubicSplineTable
1316  * Geometry:                   Water3-Water3
1317  * Calculate force/pot:        Force
1318  */
1319 void
1320 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_sse2_single
1321                     (t_nblist                    * gmx_restrict       nlist,
1322                      rvec                        * gmx_restrict          xx,
1323                      rvec                        * gmx_restrict          ff,
1324                      t_forcerec                  * gmx_restrict          fr,
1325                      t_mdatoms                   * gmx_restrict     mdatoms,
1326                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1327                      t_nrnb                      * gmx_restrict        nrnb)
1328 {
1329     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1330      * just 0 for non-waters.
1331      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1332      * jnr indices corresponding to data put in the four positions in the SIMD register.
1333      */
1334     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1335     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1336     int              jnrA,jnrB,jnrC,jnrD;
1337     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1338     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1339     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1340     real             rcutoff_scalar;
1341     real             *shiftvec,*fshift,*x,*f;
1342     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1343     real             scratch[4*DIM];
1344     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1345     int              vdwioffset0;
1346     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1347     int              vdwioffset1;
1348     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1349     int              vdwioffset2;
1350     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1351     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1352     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1353     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1354     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1355     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1356     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1357     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1358     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1359     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1360     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1361     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1362     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1363     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1364     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1365     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1366     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1367     real             *charge;
1368     int              nvdwtype;
1369     __m128           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1370     int              *vdwtype;
1371     real             *vdwparam;
1372     __m128           one_sixth   = _mm_set1_ps(1.0/6.0);
1373     __m128           one_twelfth = _mm_set1_ps(1.0/12.0);
1374     __m128i          vfitab;
1375     __m128i          ifour       = _mm_set1_epi32(4);
1376     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1377     real             *vftab;
1378     __m128           dummy_mask,cutoff_mask;
1379     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1380     __m128           one     = _mm_set1_ps(1.0);
1381     __m128           two     = _mm_set1_ps(2.0);
1382     x                = xx[0];
1383     f                = ff[0];
1384
1385     nri              = nlist->nri;
1386     iinr             = nlist->iinr;
1387     jindex           = nlist->jindex;
1388     jjnr             = nlist->jjnr;
1389     shiftidx         = nlist->shift;
1390     gid              = nlist->gid;
1391     shiftvec         = fr->shift_vec[0];
1392     fshift           = fr->fshift[0];
1393     facel            = _mm_set1_ps(fr->epsfac);
1394     charge           = mdatoms->chargeA;
1395     nvdwtype         = fr->ntype;
1396     vdwparam         = fr->nbfp;
1397     vdwtype          = mdatoms->typeA;
1398
1399     vftab            = kernel_data->table_elec_vdw->data;
1400     vftabscale       = _mm_set1_ps(kernel_data->table_elec_vdw->scale);
1401
1402     /* Setup water-specific parameters */
1403     inr              = nlist->iinr[0];
1404     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1405     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1406     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1407     vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];
1408
1409     jq0              = _mm_set1_ps(charge[inr+0]);
1410     jq1              = _mm_set1_ps(charge[inr+1]);
1411     jq2              = _mm_set1_ps(charge[inr+2]);
1412     vdwjidx0A        = 2*vdwtype[inr+0];
1413     qq00             = _mm_mul_ps(iq0,jq0);
1414     c6_00            = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A]);
1415     c12_00           = _mm_set1_ps(vdwparam[vdwioffset0+vdwjidx0A+1]);
1416     qq01             = _mm_mul_ps(iq0,jq1);
1417     qq02             = _mm_mul_ps(iq0,jq2);
1418     qq10             = _mm_mul_ps(iq1,jq0);
1419     qq11             = _mm_mul_ps(iq1,jq1);
1420     qq12             = _mm_mul_ps(iq1,jq2);
1421     qq20             = _mm_mul_ps(iq2,jq0);
1422     qq21             = _mm_mul_ps(iq2,jq1);
1423     qq22             = _mm_mul_ps(iq2,jq2);
1424
1425     /* Avoid stupid compiler warnings */
1426     jnrA = jnrB = jnrC = jnrD = 0;
1427     j_coord_offsetA = 0;
1428     j_coord_offsetB = 0;
1429     j_coord_offsetC = 0;
1430     j_coord_offsetD = 0;
1431
1432     outeriter        = 0;
1433     inneriter        = 0;
1434
1435     for(iidx=0;iidx<4*DIM;iidx++)
1436     {
1437         scratch[iidx] = 0.0;
1438     }  
1439
1440     /* Start outer loop over neighborlists */
1441     for(iidx=0; iidx<nri; iidx++)
1442     {
1443         /* Load shift vector for this list */
1444         i_shift_offset   = DIM*shiftidx[iidx];
1445
1446         /* Load limits for loop over neighbors */
1447         j_index_start    = jindex[iidx];
1448         j_index_end      = jindex[iidx+1];
1449
1450         /* Get outer coordinate index */
1451         inr              = iinr[iidx];
1452         i_coord_offset   = DIM*inr;
1453
1454         /* Load i particle coords and add shift vector */
1455         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1456                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1457         
1458         fix0             = _mm_setzero_ps();
1459         fiy0             = _mm_setzero_ps();
1460         fiz0             = _mm_setzero_ps();
1461         fix1             = _mm_setzero_ps();
1462         fiy1             = _mm_setzero_ps();
1463         fiz1             = _mm_setzero_ps();
1464         fix2             = _mm_setzero_ps();
1465         fiy2             = _mm_setzero_ps();
1466         fiz2             = _mm_setzero_ps();
1467
1468         /* Start inner kernel loop */
1469         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1470         {
1471
1472             /* Get j neighbor index, and coordinate index */
1473             jnrA             = jjnr[jidx];
1474             jnrB             = jjnr[jidx+1];
1475             jnrC             = jjnr[jidx+2];
1476             jnrD             = jjnr[jidx+3];
1477             j_coord_offsetA  = DIM*jnrA;
1478             j_coord_offsetB  = DIM*jnrB;
1479             j_coord_offsetC  = DIM*jnrC;
1480             j_coord_offsetD  = DIM*jnrD;
1481
1482             /* load j atom coordinates */
1483             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1484                                               x+j_coord_offsetC,x+j_coord_offsetD,
1485                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1486
1487             /* Calculate displacement vector */
1488             dx00             = _mm_sub_ps(ix0,jx0);
1489             dy00             = _mm_sub_ps(iy0,jy0);
1490             dz00             = _mm_sub_ps(iz0,jz0);
1491             dx01             = _mm_sub_ps(ix0,jx1);
1492             dy01             = _mm_sub_ps(iy0,jy1);
1493             dz01             = _mm_sub_ps(iz0,jz1);
1494             dx02             = _mm_sub_ps(ix0,jx2);
1495             dy02             = _mm_sub_ps(iy0,jy2);
1496             dz02             = _mm_sub_ps(iz0,jz2);
1497             dx10             = _mm_sub_ps(ix1,jx0);
1498             dy10             = _mm_sub_ps(iy1,jy0);
1499             dz10             = _mm_sub_ps(iz1,jz0);
1500             dx11             = _mm_sub_ps(ix1,jx1);
1501             dy11             = _mm_sub_ps(iy1,jy1);
1502             dz11             = _mm_sub_ps(iz1,jz1);
1503             dx12             = _mm_sub_ps(ix1,jx2);
1504             dy12             = _mm_sub_ps(iy1,jy2);
1505             dz12             = _mm_sub_ps(iz1,jz2);
1506             dx20             = _mm_sub_ps(ix2,jx0);
1507             dy20             = _mm_sub_ps(iy2,jy0);
1508             dz20             = _mm_sub_ps(iz2,jz0);
1509             dx21             = _mm_sub_ps(ix2,jx1);
1510             dy21             = _mm_sub_ps(iy2,jy1);
1511             dz21             = _mm_sub_ps(iz2,jz1);
1512             dx22             = _mm_sub_ps(ix2,jx2);
1513             dy22             = _mm_sub_ps(iy2,jy2);
1514             dz22             = _mm_sub_ps(iz2,jz2);
1515
1516             /* Calculate squared distance and things based on it */
1517             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1518             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1519             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1520             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1521             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1522             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1523             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1524             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1525             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1526
1527             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1528             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1529             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1530             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1531             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1532             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1533             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1534             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1535             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1536
1537             fjx0             = _mm_setzero_ps();
1538             fjy0             = _mm_setzero_ps();
1539             fjz0             = _mm_setzero_ps();
1540             fjx1             = _mm_setzero_ps();
1541             fjy1             = _mm_setzero_ps();
1542             fjz1             = _mm_setzero_ps();
1543             fjx2             = _mm_setzero_ps();
1544             fjy2             = _mm_setzero_ps();
1545             fjz2             = _mm_setzero_ps();
1546
1547             /**************************
1548              * CALCULATE INTERACTIONS *
1549              **************************/
1550
1551             r00              = _mm_mul_ps(rsq00,rinv00);
1552
1553             /* Calculate table index by multiplying r with table scale and truncate to integer */
1554             rt               = _mm_mul_ps(r00,vftabscale);
1555             vfitab           = _mm_cvttps_epi32(rt);
1556             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1557             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1558
1559             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1560             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1561             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1562             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1563             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1564             _MM_TRANSPOSE4_PS(Y,F,G,H);
1565             Heps             = _mm_mul_ps(vfeps,H);
1566             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1567             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1568             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1569
1570             /* CUBIC SPLINE TABLE DISPERSION */
1571             vfitab           = _mm_add_epi32(vfitab,ifour);
1572             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1573             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1574             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1575             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1576             _MM_TRANSPOSE4_PS(Y,F,G,H);
1577             Heps             = _mm_mul_ps(vfeps,H);
1578             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1579             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1580             fvdw6            = _mm_mul_ps(c6_00,FF);
1581
1582             /* CUBIC SPLINE TABLE REPULSION */
1583             vfitab           = _mm_add_epi32(vfitab,ifour);
1584             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1585             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1586             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1587             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1588             _MM_TRANSPOSE4_PS(Y,F,G,H);
1589             Heps             = _mm_mul_ps(vfeps,H);
1590             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1591             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1592             fvdw12           = _mm_mul_ps(c12_00,FF);
1593             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
1594
1595             fscal            = _mm_add_ps(felec,fvdw);
1596
1597             /* Calculate temporary vectorial force */
1598             tx               = _mm_mul_ps(fscal,dx00);
1599             ty               = _mm_mul_ps(fscal,dy00);
1600             tz               = _mm_mul_ps(fscal,dz00);
1601
1602             /* Update vectorial force */
1603             fix0             = _mm_add_ps(fix0,tx);
1604             fiy0             = _mm_add_ps(fiy0,ty);
1605             fiz0             = _mm_add_ps(fiz0,tz);
1606
1607             fjx0             = _mm_add_ps(fjx0,tx);
1608             fjy0             = _mm_add_ps(fjy0,ty);
1609             fjz0             = _mm_add_ps(fjz0,tz);
1610             
1611             /**************************
1612              * CALCULATE INTERACTIONS *
1613              **************************/
1614
1615             r01              = _mm_mul_ps(rsq01,rinv01);
1616
1617             /* Calculate table index by multiplying r with table scale and truncate to integer */
1618             rt               = _mm_mul_ps(r01,vftabscale);
1619             vfitab           = _mm_cvttps_epi32(rt);
1620             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1621             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1622
1623             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1624             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1625             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1626             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1627             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1628             _MM_TRANSPOSE4_PS(Y,F,G,H);
1629             Heps             = _mm_mul_ps(vfeps,H);
1630             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1631             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1632             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1633
1634             fscal            = felec;
1635
1636             /* Calculate temporary vectorial force */
1637             tx               = _mm_mul_ps(fscal,dx01);
1638             ty               = _mm_mul_ps(fscal,dy01);
1639             tz               = _mm_mul_ps(fscal,dz01);
1640
1641             /* Update vectorial force */
1642             fix0             = _mm_add_ps(fix0,tx);
1643             fiy0             = _mm_add_ps(fiy0,ty);
1644             fiz0             = _mm_add_ps(fiz0,tz);
1645
1646             fjx1             = _mm_add_ps(fjx1,tx);
1647             fjy1             = _mm_add_ps(fjy1,ty);
1648             fjz1             = _mm_add_ps(fjz1,tz);
1649             
1650             /**************************
1651              * CALCULATE INTERACTIONS *
1652              **************************/
1653
1654             r02              = _mm_mul_ps(rsq02,rinv02);
1655
1656             /* Calculate table index by multiplying r with table scale and truncate to integer */
1657             rt               = _mm_mul_ps(r02,vftabscale);
1658             vfitab           = _mm_cvttps_epi32(rt);
1659             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1660             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1661
1662             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1663             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1664             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1665             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1666             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1667             _MM_TRANSPOSE4_PS(Y,F,G,H);
1668             Heps             = _mm_mul_ps(vfeps,H);
1669             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1670             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1671             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1672
1673             fscal            = felec;
1674
1675             /* Calculate temporary vectorial force */
1676             tx               = _mm_mul_ps(fscal,dx02);
1677             ty               = _mm_mul_ps(fscal,dy02);
1678             tz               = _mm_mul_ps(fscal,dz02);
1679
1680             /* Update vectorial force */
1681             fix0             = _mm_add_ps(fix0,tx);
1682             fiy0             = _mm_add_ps(fiy0,ty);
1683             fiz0             = _mm_add_ps(fiz0,tz);
1684
1685             fjx2             = _mm_add_ps(fjx2,tx);
1686             fjy2             = _mm_add_ps(fjy2,ty);
1687             fjz2             = _mm_add_ps(fjz2,tz);
1688             
1689             /**************************
1690              * CALCULATE INTERACTIONS *
1691              **************************/
1692
1693             r10              = _mm_mul_ps(rsq10,rinv10);
1694
1695             /* Calculate table index by multiplying r with table scale and truncate to integer */
1696             rt               = _mm_mul_ps(r10,vftabscale);
1697             vfitab           = _mm_cvttps_epi32(rt);
1698             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1699             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1700
1701             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1702             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1703             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1704             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1705             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1706             _MM_TRANSPOSE4_PS(Y,F,G,H);
1707             Heps             = _mm_mul_ps(vfeps,H);
1708             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1709             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1710             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1711
1712             fscal            = felec;
1713
1714             /* Calculate temporary vectorial force */
1715             tx               = _mm_mul_ps(fscal,dx10);
1716             ty               = _mm_mul_ps(fscal,dy10);
1717             tz               = _mm_mul_ps(fscal,dz10);
1718
1719             /* Update vectorial force */
1720             fix1             = _mm_add_ps(fix1,tx);
1721             fiy1             = _mm_add_ps(fiy1,ty);
1722             fiz1             = _mm_add_ps(fiz1,tz);
1723
1724             fjx0             = _mm_add_ps(fjx0,tx);
1725             fjy0             = _mm_add_ps(fjy0,ty);
1726             fjz0             = _mm_add_ps(fjz0,tz);
1727             
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             r11              = _mm_mul_ps(rsq11,rinv11);
1733
1734             /* Calculate table index by multiplying r with table scale and truncate to integer */
1735             rt               = _mm_mul_ps(r11,vftabscale);
1736             vfitab           = _mm_cvttps_epi32(rt);
1737             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1738             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1739
1740             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1741             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1742             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1743             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1744             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1745             _MM_TRANSPOSE4_PS(Y,F,G,H);
1746             Heps             = _mm_mul_ps(vfeps,H);
1747             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1748             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1749             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1750
1751             fscal            = felec;
1752
1753             /* Calculate temporary vectorial force */
1754             tx               = _mm_mul_ps(fscal,dx11);
1755             ty               = _mm_mul_ps(fscal,dy11);
1756             tz               = _mm_mul_ps(fscal,dz11);
1757
1758             /* Update vectorial force */
1759             fix1             = _mm_add_ps(fix1,tx);
1760             fiy1             = _mm_add_ps(fiy1,ty);
1761             fiz1             = _mm_add_ps(fiz1,tz);
1762
1763             fjx1             = _mm_add_ps(fjx1,tx);
1764             fjy1             = _mm_add_ps(fjy1,ty);
1765             fjz1             = _mm_add_ps(fjz1,tz);
1766             
1767             /**************************
1768              * CALCULATE INTERACTIONS *
1769              **************************/
1770
1771             r12              = _mm_mul_ps(rsq12,rinv12);
1772
1773             /* Calculate table index by multiplying r with table scale and truncate to integer */
1774             rt               = _mm_mul_ps(r12,vftabscale);
1775             vfitab           = _mm_cvttps_epi32(rt);
1776             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1777             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1778
1779             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1780             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1781             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1782             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1783             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1784             _MM_TRANSPOSE4_PS(Y,F,G,H);
1785             Heps             = _mm_mul_ps(vfeps,H);
1786             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1787             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1788             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1789
1790             fscal            = felec;
1791
1792             /* Calculate temporary vectorial force */
1793             tx               = _mm_mul_ps(fscal,dx12);
1794             ty               = _mm_mul_ps(fscal,dy12);
1795             tz               = _mm_mul_ps(fscal,dz12);
1796
1797             /* Update vectorial force */
1798             fix1             = _mm_add_ps(fix1,tx);
1799             fiy1             = _mm_add_ps(fiy1,ty);
1800             fiz1             = _mm_add_ps(fiz1,tz);
1801
1802             fjx2             = _mm_add_ps(fjx2,tx);
1803             fjy2             = _mm_add_ps(fjy2,ty);
1804             fjz2             = _mm_add_ps(fjz2,tz);
1805             
1806             /**************************
1807              * CALCULATE INTERACTIONS *
1808              **************************/
1809
1810             r20              = _mm_mul_ps(rsq20,rinv20);
1811
1812             /* Calculate table index by multiplying r with table scale and truncate to integer */
1813             rt               = _mm_mul_ps(r20,vftabscale);
1814             vfitab           = _mm_cvttps_epi32(rt);
1815             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1816             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1817
1818             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1819             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1820             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1821             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1822             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1823             _MM_TRANSPOSE4_PS(Y,F,G,H);
1824             Heps             = _mm_mul_ps(vfeps,H);
1825             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1826             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1827             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1828
1829             fscal            = felec;
1830
1831             /* Calculate temporary vectorial force */
1832             tx               = _mm_mul_ps(fscal,dx20);
1833             ty               = _mm_mul_ps(fscal,dy20);
1834             tz               = _mm_mul_ps(fscal,dz20);
1835
1836             /* Update vectorial force */
1837             fix2             = _mm_add_ps(fix2,tx);
1838             fiy2             = _mm_add_ps(fiy2,ty);
1839             fiz2             = _mm_add_ps(fiz2,tz);
1840
1841             fjx0             = _mm_add_ps(fjx0,tx);
1842             fjy0             = _mm_add_ps(fjy0,ty);
1843             fjz0             = _mm_add_ps(fjz0,tz);
1844             
1845             /**************************
1846              * CALCULATE INTERACTIONS *
1847              **************************/
1848
1849             r21              = _mm_mul_ps(rsq21,rinv21);
1850
1851             /* Calculate table index by multiplying r with table scale and truncate to integer */
1852             rt               = _mm_mul_ps(r21,vftabscale);
1853             vfitab           = _mm_cvttps_epi32(rt);
1854             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1855             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1856
1857             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1858             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1859             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1860             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1861             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1862             _MM_TRANSPOSE4_PS(Y,F,G,H);
1863             Heps             = _mm_mul_ps(vfeps,H);
1864             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1865             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1866             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1867
1868             fscal            = felec;
1869
1870             /* Calculate temporary vectorial force */
1871             tx               = _mm_mul_ps(fscal,dx21);
1872             ty               = _mm_mul_ps(fscal,dy21);
1873             tz               = _mm_mul_ps(fscal,dz21);
1874
1875             /* Update vectorial force */
1876             fix2             = _mm_add_ps(fix2,tx);
1877             fiy2             = _mm_add_ps(fiy2,ty);
1878             fiz2             = _mm_add_ps(fiz2,tz);
1879
1880             fjx1             = _mm_add_ps(fjx1,tx);
1881             fjy1             = _mm_add_ps(fjy1,ty);
1882             fjz1             = _mm_add_ps(fjz1,tz);
1883             
1884             /**************************
1885              * CALCULATE INTERACTIONS *
1886              **************************/
1887
1888             r22              = _mm_mul_ps(rsq22,rinv22);
1889
1890             /* Calculate table index by multiplying r with table scale and truncate to integer */
1891             rt               = _mm_mul_ps(r22,vftabscale);
1892             vfitab           = _mm_cvttps_epi32(rt);
1893             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1894             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
1895
1896             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1897             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1898             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1899             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1900             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1901             _MM_TRANSPOSE4_PS(Y,F,G,H);
1902             Heps             = _mm_mul_ps(vfeps,H);
1903             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1904             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1905             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1906
1907             fscal            = felec;
1908
1909             /* Calculate temporary vectorial force */
1910             tx               = _mm_mul_ps(fscal,dx22);
1911             ty               = _mm_mul_ps(fscal,dy22);
1912             tz               = _mm_mul_ps(fscal,dz22);
1913
1914             /* Update vectorial force */
1915             fix2             = _mm_add_ps(fix2,tx);
1916             fiy2             = _mm_add_ps(fiy2,ty);
1917             fiz2             = _mm_add_ps(fiz2,tz);
1918
1919             fjx2             = _mm_add_ps(fjx2,tx);
1920             fjy2             = _mm_add_ps(fjy2,ty);
1921             fjz2             = _mm_add_ps(fjz2,tz);
1922             
1923             fjptrA             = f+j_coord_offsetA;
1924             fjptrB             = f+j_coord_offsetB;
1925             fjptrC             = f+j_coord_offsetC;
1926             fjptrD             = f+j_coord_offsetD;
1927
1928             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1929                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1930
1931             /* Inner loop uses 373 flops */
1932         }
1933
1934         if(jidx<j_index_end)
1935         {
1936
1937             /* Get j neighbor index, and coordinate index */
1938             jnrlistA         = jjnr[jidx];
1939             jnrlistB         = jjnr[jidx+1];
1940             jnrlistC         = jjnr[jidx+2];
1941             jnrlistD         = jjnr[jidx+3];
1942             /* Sign of each element will be negative for non-real atoms.
1943              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1944              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1945              */
1946             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1947             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1948             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1949             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1950             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1951             j_coord_offsetA  = DIM*jnrA;
1952             j_coord_offsetB  = DIM*jnrB;
1953             j_coord_offsetC  = DIM*jnrC;
1954             j_coord_offsetD  = DIM*jnrD;
1955
1956             /* load j atom coordinates */
1957             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1958                                               x+j_coord_offsetC,x+j_coord_offsetD,
1959                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1960
1961             /* Calculate displacement vector */
1962             dx00             = _mm_sub_ps(ix0,jx0);
1963             dy00             = _mm_sub_ps(iy0,jy0);
1964             dz00             = _mm_sub_ps(iz0,jz0);
1965             dx01             = _mm_sub_ps(ix0,jx1);
1966             dy01             = _mm_sub_ps(iy0,jy1);
1967             dz01             = _mm_sub_ps(iz0,jz1);
1968             dx02             = _mm_sub_ps(ix0,jx2);
1969             dy02             = _mm_sub_ps(iy0,jy2);
1970             dz02             = _mm_sub_ps(iz0,jz2);
1971             dx10             = _mm_sub_ps(ix1,jx0);
1972             dy10             = _mm_sub_ps(iy1,jy0);
1973             dz10             = _mm_sub_ps(iz1,jz0);
1974             dx11             = _mm_sub_ps(ix1,jx1);
1975             dy11             = _mm_sub_ps(iy1,jy1);
1976             dz11             = _mm_sub_ps(iz1,jz1);
1977             dx12             = _mm_sub_ps(ix1,jx2);
1978             dy12             = _mm_sub_ps(iy1,jy2);
1979             dz12             = _mm_sub_ps(iz1,jz2);
1980             dx20             = _mm_sub_ps(ix2,jx0);
1981             dy20             = _mm_sub_ps(iy2,jy0);
1982             dz20             = _mm_sub_ps(iz2,jz0);
1983             dx21             = _mm_sub_ps(ix2,jx1);
1984             dy21             = _mm_sub_ps(iy2,jy1);
1985             dz21             = _mm_sub_ps(iz2,jz1);
1986             dx22             = _mm_sub_ps(ix2,jx2);
1987             dy22             = _mm_sub_ps(iy2,jy2);
1988             dz22             = _mm_sub_ps(iz2,jz2);
1989
1990             /* Calculate squared distance and things based on it */
1991             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1992             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1993             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1994             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1995             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1996             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1997             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1998             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1999             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
2000
2001             rinv00           = gmx_mm_invsqrt_ps(rsq00);
2002             rinv01           = gmx_mm_invsqrt_ps(rsq01);
2003             rinv02           = gmx_mm_invsqrt_ps(rsq02);
2004             rinv10           = gmx_mm_invsqrt_ps(rsq10);
2005             rinv11           = gmx_mm_invsqrt_ps(rsq11);
2006             rinv12           = gmx_mm_invsqrt_ps(rsq12);
2007             rinv20           = gmx_mm_invsqrt_ps(rsq20);
2008             rinv21           = gmx_mm_invsqrt_ps(rsq21);
2009             rinv22           = gmx_mm_invsqrt_ps(rsq22);
2010
2011             fjx0             = _mm_setzero_ps();
2012             fjy0             = _mm_setzero_ps();
2013             fjz0             = _mm_setzero_ps();
2014             fjx1             = _mm_setzero_ps();
2015             fjy1             = _mm_setzero_ps();
2016             fjz1             = _mm_setzero_ps();
2017             fjx2             = _mm_setzero_ps();
2018             fjy2             = _mm_setzero_ps();
2019             fjz2             = _mm_setzero_ps();
2020
2021             /**************************
2022              * CALCULATE INTERACTIONS *
2023              **************************/
2024
2025             r00              = _mm_mul_ps(rsq00,rinv00);
2026             r00              = _mm_andnot_ps(dummy_mask,r00);
2027
2028             /* Calculate table index by multiplying r with table scale and truncate to integer */
2029             rt               = _mm_mul_ps(r00,vftabscale);
2030             vfitab           = _mm_cvttps_epi32(rt);
2031             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2032             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2033
2034             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2035             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2036             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2037             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2038             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2039             _MM_TRANSPOSE4_PS(Y,F,G,H);
2040             Heps             = _mm_mul_ps(vfeps,H);
2041             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2042             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2043             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
2044
2045             /* CUBIC SPLINE TABLE DISPERSION */
2046             vfitab           = _mm_add_epi32(vfitab,ifour);
2047             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2048             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2049             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2050             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2051             _MM_TRANSPOSE4_PS(Y,F,G,H);
2052             Heps             = _mm_mul_ps(vfeps,H);
2053             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2054             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2055             fvdw6            = _mm_mul_ps(c6_00,FF);
2056
2057             /* CUBIC SPLINE TABLE REPULSION */
2058             vfitab           = _mm_add_epi32(vfitab,ifour);
2059             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2060             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2061             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2062             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2063             _MM_TRANSPOSE4_PS(Y,F,G,H);
2064             Heps             = _mm_mul_ps(vfeps,H);
2065             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2066             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2067             fvdw12           = _mm_mul_ps(c12_00,FF);
2068             fvdw             = _mm_xor_ps(signbit,_mm_mul_ps(_mm_add_ps(fvdw6,fvdw12),_mm_mul_ps(vftabscale,rinv00)));
2069
2070             fscal            = _mm_add_ps(felec,fvdw);
2071
2072             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2073
2074             /* Calculate temporary vectorial force */
2075             tx               = _mm_mul_ps(fscal,dx00);
2076             ty               = _mm_mul_ps(fscal,dy00);
2077             tz               = _mm_mul_ps(fscal,dz00);
2078
2079             /* Update vectorial force */
2080             fix0             = _mm_add_ps(fix0,tx);
2081             fiy0             = _mm_add_ps(fiy0,ty);
2082             fiz0             = _mm_add_ps(fiz0,tz);
2083
2084             fjx0             = _mm_add_ps(fjx0,tx);
2085             fjy0             = _mm_add_ps(fjy0,ty);
2086             fjz0             = _mm_add_ps(fjz0,tz);
2087             
2088             /**************************
2089              * CALCULATE INTERACTIONS *
2090              **************************/
2091
2092             r01              = _mm_mul_ps(rsq01,rinv01);
2093             r01              = _mm_andnot_ps(dummy_mask,r01);
2094
2095             /* Calculate table index by multiplying r with table scale and truncate to integer */
2096             rt               = _mm_mul_ps(r01,vftabscale);
2097             vfitab           = _mm_cvttps_epi32(rt);
2098             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2099             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2100
2101             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2102             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2103             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2104             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2105             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2106             _MM_TRANSPOSE4_PS(Y,F,G,H);
2107             Heps             = _mm_mul_ps(vfeps,H);
2108             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2109             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2110             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
2111
2112             fscal            = felec;
2113
2114             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2115
2116             /* Calculate temporary vectorial force */
2117             tx               = _mm_mul_ps(fscal,dx01);
2118             ty               = _mm_mul_ps(fscal,dy01);
2119             tz               = _mm_mul_ps(fscal,dz01);
2120
2121             /* Update vectorial force */
2122             fix0             = _mm_add_ps(fix0,tx);
2123             fiy0             = _mm_add_ps(fiy0,ty);
2124             fiz0             = _mm_add_ps(fiz0,tz);
2125
2126             fjx1             = _mm_add_ps(fjx1,tx);
2127             fjy1             = _mm_add_ps(fjy1,ty);
2128             fjz1             = _mm_add_ps(fjz1,tz);
2129             
2130             /**************************
2131              * CALCULATE INTERACTIONS *
2132              **************************/
2133
2134             r02              = _mm_mul_ps(rsq02,rinv02);
2135             r02              = _mm_andnot_ps(dummy_mask,r02);
2136
2137             /* Calculate table index by multiplying r with table scale and truncate to integer */
2138             rt               = _mm_mul_ps(r02,vftabscale);
2139             vfitab           = _mm_cvttps_epi32(rt);
2140             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2141             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2142
2143             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2144             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2145             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2146             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2147             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2148             _MM_TRANSPOSE4_PS(Y,F,G,H);
2149             Heps             = _mm_mul_ps(vfeps,H);
2150             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2151             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2152             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2153
2154             fscal            = felec;
2155
2156             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2157
2158             /* Calculate temporary vectorial force */
2159             tx               = _mm_mul_ps(fscal,dx02);
2160             ty               = _mm_mul_ps(fscal,dy02);
2161             tz               = _mm_mul_ps(fscal,dz02);
2162
2163             /* Update vectorial force */
2164             fix0             = _mm_add_ps(fix0,tx);
2165             fiy0             = _mm_add_ps(fiy0,ty);
2166             fiz0             = _mm_add_ps(fiz0,tz);
2167
2168             fjx2             = _mm_add_ps(fjx2,tx);
2169             fjy2             = _mm_add_ps(fjy2,ty);
2170             fjz2             = _mm_add_ps(fjz2,tz);
2171             
2172             /**************************
2173              * CALCULATE INTERACTIONS *
2174              **************************/
2175
2176             r10              = _mm_mul_ps(rsq10,rinv10);
2177             r10              = _mm_andnot_ps(dummy_mask,r10);
2178
2179             /* Calculate table index by multiplying r with table scale and truncate to integer */
2180             rt               = _mm_mul_ps(r10,vftabscale);
2181             vfitab           = _mm_cvttps_epi32(rt);
2182             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2183             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2184
2185             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2186             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2187             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2188             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2189             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2190             _MM_TRANSPOSE4_PS(Y,F,G,H);
2191             Heps             = _mm_mul_ps(vfeps,H);
2192             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2193             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2194             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2195
2196             fscal            = felec;
2197
2198             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2199
2200             /* Calculate temporary vectorial force */
2201             tx               = _mm_mul_ps(fscal,dx10);
2202             ty               = _mm_mul_ps(fscal,dy10);
2203             tz               = _mm_mul_ps(fscal,dz10);
2204
2205             /* Update vectorial force */
2206             fix1             = _mm_add_ps(fix1,tx);
2207             fiy1             = _mm_add_ps(fiy1,ty);
2208             fiz1             = _mm_add_ps(fiz1,tz);
2209
2210             fjx0             = _mm_add_ps(fjx0,tx);
2211             fjy0             = _mm_add_ps(fjy0,ty);
2212             fjz0             = _mm_add_ps(fjz0,tz);
2213             
2214             /**************************
2215              * CALCULATE INTERACTIONS *
2216              **************************/
2217
2218             r11              = _mm_mul_ps(rsq11,rinv11);
2219             r11              = _mm_andnot_ps(dummy_mask,r11);
2220
2221             /* Calculate table index by multiplying r with table scale and truncate to integer */
2222             rt               = _mm_mul_ps(r11,vftabscale);
2223             vfitab           = _mm_cvttps_epi32(rt);
2224             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2225             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2226
2227             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2228             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2229             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2230             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2231             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2232             _MM_TRANSPOSE4_PS(Y,F,G,H);
2233             Heps             = _mm_mul_ps(vfeps,H);
2234             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2235             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2236             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2237
2238             fscal            = felec;
2239
2240             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2241
2242             /* Calculate temporary vectorial force */
2243             tx               = _mm_mul_ps(fscal,dx11);
2244             ty               = _mm_mul_ps(fscal,dy11);
2245             tz               = _mm_mul_ps(fscal,dz11);
2246
2247             /* Update vectorial force */
2248             fix1             = _mm_add_ps(fix1,tx);
2249             fiy1             = _mm_add_ps(fiy1,ty);
2250             fiz1             = _mm_add_ps(fiz1,tz);
2251
2252             fjx1             = _mm_add_ps(fjx1,tx);
2253             fjy1             = _mm_add_ps(fjy1,ty);
2254             fjz1             = _mm_add_ps(fjz1,tz);
2255             
2256             /**************************
2257              * CALCULATE INTERACTIONS *
2258              **************************/
2259
2260             r12              = _mm_mul_ps(rsq12,rinv12);
2261             r12              = _mm_andnot_ps(dummy_mask,r12);
2262
2263             /* Calculate table index by multiplying r with table scale and truncate to integer */
2264             rt               = _mm_mul_ps(r12,vftabscale);
2265             vfitab           = _mm_cvttps_epi32(rt);
2266             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2267             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2268
2269             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2270             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2271             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2272             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2273             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2274             _MM_TRANSPOSE4_PS(Y,F,G,H);
2275             Heps             = _mm_mul_ps(vfeps,H);
2276             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2277             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2278             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2279
2280             fscal            = felec;
2281
2282             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2283
2284             /* Calculate temporary vectorial force */
2285             tx               = _mm_mul_ps(fscal,dx12);
2286             ty               = _mm_mul_ps(fscal,dy12);
2287             tz               = _mm_mul_ps(fscal,dz12);
2288
2289             /* Update vectorial force */
2290             fix1             = _mm_add_ps(fix1,tx);
2291             fiy1             = _mm_add_ps(fiy1,ty);
2292             fiz1             = _mm_add_ps(fiz1,tz);
2293
2294             fjx2             = _mm_add_ps(fjx2,tx);
2295             fjy2             = _mm_add_ps(fjy2,ty);
2296             fjz2             = _mm_add_ps(fjz2,tz);
2297             
2298             /**************************
2299              * CALCULATE INTERACTIONS *
2300              **************************/
2301
2302             r20              = _mm_mul_ps(rsq20,rinv20);
2303             r20              = _mm_andnot_ps(dummy_mask,r20);
2304
2305             /* Calculate table index by multiplying r with table scale and truncate to integer */
2306             rt               = _mm_mul_ps(r20,vftabscale);
2307             vfitab           = _mm_cvttps_epi32(rt);
2308             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2309             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2310
2311             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2312             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2313             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2314             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2315             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2316             _MM_TRANSPOSE4_PS(Y,F,G,H);
2317             Heps             = _mm_mul_ps(vfeps,H);
2318             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2319             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2320             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2321
2322             fscal            = felec;
2323
2324             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2325
2326             /* Calculate temporary vectorial force */
2327             tx               = _mm_mul_ps(fscal,dx20);
2328             ty               = _mm_mul_ps(fscal,dy20);
2329             tz               = _mm_mul_ps(fscal,dz20);
2330
2331             /* Update vectorial force */
2332             fix2             = _mm_add_ps(fix2,tx);
2333             fiy2             = _mm_add_ps(fiy2,ty);
2334             fiz2             = _mm_add_ps(fiz2,tz);
2335
2336             fjx0             = _mm_add_ps(fjx0,tx);
2337             fjy0             = _mm_add_ps(fjy0,ty);
2338             fjz0             = _mm_add_ps(fjz0,tz);
2339             
2340             /**************************
2341              * CALCULATE INTERACTIONS *
2342              **************************/
2343
2344             r21              = _mm_mul_ps(rsq21,rinv21);
2345             r21              = _mm_andnot_ps(dummy_mask,r21);
2346
2347             /* Calculate table index by multiplying r with table scale and truncate to integer */
2348             rt               = _mm_mul_ps(r21,vftabscale);
2349             vfitab           = _mm_cvttps_epi32(rt);
2350             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2351             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2352
2353             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2354             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2355             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2356             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2357             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2358             _MM_TRANSPOSE4_PS(Y,F,G,H);
2359             Heps             = _mm_mul_ps(vfeps,H);
2360             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2361             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2362             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2363
2364             fscal            = felec;
2365
2366             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2367
2368             /* Calculate temporary vectorial force */
2369             tx               = _mm_mul_ps(fscal,dx21);
2370             ty               = _mm_mul_ps(fscal,dy21);
2371             tz               = _mm_mul_ps(fscal,dz21);
2372
2373             /* Update vectorial force */
2374             fix2             = _mm_add_ps(fix2,tx);
2375             fiy2             = _mm_add_ps(fiy2,ty);
2376             fiz2             = _mm_add_ps(fiz2,tz);
2377
2378             fjx1             = _mm_add_ps(fjx1,tx);
2379             fjy1             = _mm_add_ps(fjy1,ty);
2380             fjz1             = _mm_add_ps(fjz1,tz);
2381             
2382             /**************************
2383              * CALCULATE INTERACTIONS *
2384              **************************/
2385
2386             r22              = _mm_mul_ps(rsq22,rinv22);
2387             r22              = _mm_andnot_ps(dummy_mask,r22);
2388
2389             /* Calculate table index by multiplying r with table scale and truncate to integer */
2390             rt               = _mm_mul_ps(r22,vftabscale);
2391             vfitab           = _mm_cvttps_epi32(rt);
2392             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2393             vfitab           = _mm_slli_epi32(_mm_add_epi32(vfitab,_mm_slli_epi32(vfitab,1)),2);
2394
2395             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2396             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2397             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2398             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2399             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2400             _MM_TRANSPOSE4_PS(Y,F,G,H);
2401             Heps             = _mm_mul_ps(vfeps,H);
2402             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2403             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2404             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2405
2406             fscal            = felec;
2407
2408             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2409
2410             /* Calculate temporary vectorial force */
2411             tx               = _mm_mul_ps(fscal,dx22);
2412             ty               = _mm_mul_ps(fscal,dy22);
2413             tz               = _mm_mul_ps(fscal,dz22);
2414
2415             /* Update vectorial force */
2416             fix2             = _mm_add_ps(fix2,tx);
2417             fiy2             = _mm_add_ps(fiy2,ty);
2418             fiz2             = _mm_add_ps(fiz2,tz);
2419
2420             fjx2             = _mm_add_ps(fjx2,tx);
2421             fjy2             = _mm_add_ps(fjy2,ty);
2422             fjz2             = _mm_add_ps(fjz2,tz);
2423             
2424             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2425             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2426             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2427             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2428
2429             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2430                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2431
2432             /* Inner loop uses 382 flops */
2433         }
2434
2435         /* End of innermost loop */
2436
2437         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2438                                               f+i_coord_offset,fshift+i_shift_offset);
2439
2440         /* Increment number of inner iterations */
2441         inneriter                  += j_index_end - j_index_start;
2442
2443         /* Outer loop uses 18 flops */
2444     }
2445
2446     /* Increment number of outer iterations */
2447     outeriter        += nri;
2448
2449     /* Update outer/inner flops */
2450
2451     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);
2452 }