1b43a540c1021fb2ff0da4209b61f3e039620b74
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_sse2_single / nb_kernel_ElecCSTab_VdwNone_GeomW3W3_sse2_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS sse2_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gmx_math_x86_sse2_single.h"
50 #include "kernelutil_x86_sse2_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single
54  * Electrostatics interaction: CubicSplineTable
55  * VdW interaction:            None
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_sse2_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83     real             scratch[4*DIM];
84     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     int              vdwioffset0;
86     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     int              vdwioffset1;
88     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     int              vdwioffset2;
90     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     __m128i          vfitab;
109     __m128i          ifour       = _mm_set1_epi32(4);
110     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
111     real             *vftab;
112     __m128           dummy_mask,cutoff_mask;
113     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
114     __m128           one     = _mm_set1_ps(1.0);
115     __m128           two     = _mm_set1_ps(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm_set1_ps(fr->epsfac);
128     charge           = mdatoms->chargeA;
129
130     vftab            = kernel_data->table_elec->data;
131     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
132
133     /* Setup water-specific parameters */
134     inr              = nlist->iinr[0];
135     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
136     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
137     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
138
139     jq0              = _mm_set1_ps(charge[inr+0]);
140     jq1              = _mm_set1_ps(charge[inr+1]);
141     jq2              = _mm_set1_ps(charge[inr+2]);
142     qq00             = _mm_mul_ps(iq0,jq0);
143     qq01             = _mm_mul_ps(iq0,jq1);
144     qq02             = _mm_mul_ps(iq0,jq2);
145     qq10             = _mm_mul_ps(iq1,jq0);
146     qq11             = _mm_mul_ps(iq1,jq1);
147     qq12             = _mm_mul_ps(iq1,jq2);
148     qq20             = _mm_mul_ps(iq2,jq0);
149     qq21             = _mm_mul_ps(iq2,jq1);
150     qq22             = _mm_mul_ps(iq2,jq2);
151
152     /* Avoid stupid compiler warnings */
153     jnrA = jnrB = jnrC = jnrD = 0;
154     j_coord_offsetA = 0;
155     j_coord_offsetB = 0;
156     j_coord_offsetC = 0;
157     j_coord_offsetD = 0;
158
159     outeriter        = 0;
160     inneriter        = 0;
161
162     for(iidx=0;iidx<4*DIM;iidx++)
163     {
164         scratch[iidx] = 0.0;
165     }  
166
167     /* Start outer loop over neighborlists */
168     for(iidx=0; iidx<nri; iidx++)
169     {
170         /* Load shift vector for this list */
171         i_shift_offset   = DIM*shiftidx[iidx];
172
173         /* Load limits for loop over neighbors */
174         j_index_start    = jindex[iidx];
175         j_index_end      = jindex[iidx+1];
176
177         /* Get outer coordinate index */
178         inr              = iinr[iidx];
179         i_coord_offset   = DIM*inr;
180
181         /* Load i particle coords and add shift vector */
182         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
183                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
184         
185         fix0             = _mm_setzero_ps();
186         fiy0             = _mm_setzero_ps();
187         fiz0             = _mm_setzero_ps();
188         fix1             = _mm_setzero_ps();
189         fiy1             = _mm_setzero_ps();
190         fiz1             = _mm_setzero_ps();
191         fix2             = _mm_setzero_ps();
192         fiy2             = _mm_setzero_ps();
193         fiz2             = _mm_setzero_ps();
194
195         /* Reset potential sums */
196         velecsum         = _mm_setzero_ps();
197
198         /* Start inner kernel loop */
199         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
200         {
201
202             /* Get j neighbor index, and coordinate index */
203             jnrA             = jjnr[jidx];
204             jnrB             = jjnr[jidx+1];
205             jnrC             = jjnr[jidx+2];
206             jnrD             = jjnr[jidx+3];
207             j_coord_offsetA  = DIM*jnrA;
208             j_coord_offsetB  = DIM*jnrB;
209             j_coord_offsetC  = DIM*jnrC;
210             j_coord_offsetD  = DIM*jnrD;
211
212             /* load j atom coordinates */
213             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
214                                               x+j_coord_offsetC,x+j_coord_offsetD,
215                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
216
217             /* Calculate displacement vector */
218             dx00             = _mm_sub_ps(ix0,jx0);
219             dy00             = _mm_sub_ps(iy0,jy0);
220             dz00             = _mm_sub_ps(iz0,jz0);
221             dx01             = _mm_sub_ps(ix0,jx1);
222             dy01             = _mm_sub_ps(iy0,jy1);
223             dz01             = _mm_sub_ps(iz0,jz1);
224             dx02             = _mm_sub_ps(ix0,jx2);
225             dy02             = _mm_sub_ps(iy0,jy2);
226             dz02             = _mm_sub_ps(iz0,jz2);
227             dx10             = _mm_sub_ps(ix1,jx0);
228             dy10             = _mm_sub_ps(iy1,jy0);
229             dz10             = _mm_sub_ps(iz1,jz0);
230             dx11             = _mm_sub_ps(ix1,jx1);
231             dy11             = _mm_sub_ps(iy1,jy1);
232             dz11             = _mm_sub_ps(iz1,jz1);
233             dx12             = _mm_sub_ps(ix1,jx2);
234             dy12             = _mm_sub_ps(iy1,jy2);
235             dz12             = _mm_sub_ps(iz1,jz2);
236             dx20             = _mm_sub_ps(ix2,jx0);
237             dy20             = _mm_sub_ps(iy2,jy0);
238             dz20             = _mm_sub_ps(iz2,jz0);
239             dx21             = _mm_sub_ps(ix2,jx1);
240             dy21             = _mm_sub_ps(iy2,jy1);
241             dz21             = _mm_sub_ps(iz2,jz1);
242             dx22             = _mm_sub_ps(ix2,jx2);
243             dy22             = _mm_sub_ps(iy2,jy2);
244             dz22             = _mm_sub_ps(iz2,jz2);
245
246             /* Calculate squared distance and things based on it */
247             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
248             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
249             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
250             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
251             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
252             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
253             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
254             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
255             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
256
257             rinv00           = gmx_mm_invsqrt_ps(rsq00);
258             rinv01           = gmx_mm_invsqrt_ps(rsq01);
259             rinv02           = gmx_mm_invsqrt_ps(rsq02);
260             rinv10           = gmx_mm_invsqrt_ps(rsq10);
261             rinv11           = gmx_mm_invsqrt_ps(rsq11);
262             rinv12           = gmx_mm_invsqrt_ps(rsq12);
263             rinv20           = gmx_mm_invsqrt_ps(rsq20);
264             rinv21           = gmx_mm_invsqrt_ps(rsq21);
265             rinv22           = gmx_mm_invsqrt_ps(rsq22);
266
267             fjx0             = _mm_setzero_ps();
268             fjy0             = _mm_setzero_ps();
269             fjz0             = _mm_setzero_ps();
270             fjx1             = _mm_setzero_ps();
271             fjy1             = _mm_setzero_ps();
272             fjz1             = _mm_setzero_ps();
273             fjx2             = _mm_setzero_ps();
274             fjy2             = _mm_setzero_ps();
275             fjz2             = _mm_setzero_ps();
276
277             /**************************
278              * CALCULATE INTERACTIONS *
279              **************************/
280
281             r00              = _mm_mul_ps(rsq00,rinv00);
282
283             /* Calculate table index by multiplying r with table scale and truncate to integer */
284             rt               = _mm_mul_ps(r00,vftabscale);
285             vfitab           = _mm_cvttps_epi32(rt);
286             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
287             vfitab           = _mm_slli_epi32(vfitab,2);
288
289             /* CUBIC SPLINE TABLE ELECTROSTATICS */
290             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
291             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
292             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
293             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
294             _MM_TRANSPOSE4_PS(Y,F,G,H);
295             Heps             = _mm_mul_ps(vfeps,H);
296             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
297             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
298             velec            = _mm_mul_ps(qq00,VV);
299             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
300             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
301
302             /* Update potential sum for this i atom from the interaction with this j atom. */
303             velecsum         = _mm_add_ps(velecsum,velec);
304
305             fscal            = felec;
306
307             /* Calculate temporary vectorial force */
308             tx               = _mm_mul_ps(fscal,dx00);
309             ty               = _mm_mul_ps(fscal,dy00);
310             tz               = _mm_mul_ps(fscal,dz00);
311
312             /* Update vectorial force */
313             fix0             = _mm_add_ps(fix0,tx);
314             fiy0             = _mm_add_ps(fiy0,ty);
315             fiz0             = _mm_add_ps(fiz0,tz);
316
317             fjx0             = _mm_add_ps(fjx0,tx);
318             fjy0             = _mm_add_ps(fjy0,ty);
319             fjz0             = _mm_add_ps(fjz0,tz);
320             
321             /**************************
322              * CALCULATE INTERACTIONS *
323              **************************/
324
325             r01              = _mm_mul_ps(rsq01,rinv01);
326
327             /* Calculate table index by multiplying r with table scale and truncate to integer */
328             rt               = _mm_mul_ps(r01,vftabscale);
329             vfitab           = _mm_cvttps_epi32(rt);
330             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
331             vfitab           = _mm_slli_epi32(vfitab,2);
332
333             /* CUBIC SPLINE TABLE ELECTROSTATICS */
334             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
335             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
336             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
337             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
338             _MM_TRANSPOSE4_PS(Y,F,G,H);
339             Heps             = _mm_mul_ps(vfeps,H);
340             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
341             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
342             velec            = _mm_mul_ps(qq01,VV);
343             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
344             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
345
346             /* Update potential sum for this i atom from the interaction with this j atom. */
347             velecsum         = _mm_add_ps(velecsum,velec);
348
349             fscal            = felec;
350
351             /* Calculate temporary vectorial force */
352             tx               = _mm_mul_ps(fscal,dx01);
353             ty               = _mm_mul_ps(fscal,dy01);
354             tz               = _mm_mul_ps(fscal,dz01);
355
356             /* Update vectorial force */
357             fix0             = _mm_add_ps(fix0,tx);
358             fiy0             = _mm_add_ps(fiy0,ty);
359             fiz0             = _mm_add_ps(fiz0,tz);
360
361             fjx1             = _mm_add_ps(fjx1,tx);
362             fjy1             = _mm_add_ps(fjy1,ty);
363             fjz1             = _mm_add_ps(fjz1,tz);
364             
365             /**************************
366              * CALCULATE INTERACTIONS *
367              **************************/
368
369             r02              = _mm_mul_ps(rsq02,rinv02);
370
371             /* Calculate table index by multiplying r with table scale and truncate to integer */
372             rt               = _mm_mul_ps(r02,vftabscale);
373             vfitab           = _mm_cvttps_epi32(rt);
374             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
375             vfitab           = _mm_slli_epi32(vfitab,2);
376
377             /* CUBIC SPLINE TABLE ELECTROSTATICS */
378             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
379             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
380             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
381             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
382             _MM_TRANSPOSE4_PS(Y,F,G,H);
383             Heps             = _mm_mul_ps(vfeps,H);
384             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
385             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
386             velec            = _mm_mul_ps(qq02,VV);
387             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
388             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
389
390             /* Update potential sum for this i atom from the interaction with this j atom. */
391             velecsum         = _mm_add_ps(velecsum,velec);
392
393             fscal            = felec;
394
395             /* Calculate temporary vectorial force */
396             tx               = _mm_mul_ps(fscal,dx02);
397             ty               = _mm_mul_ps(fscal,dy02);
398             tz               = _mm_mul_ps(fscal,dz02);
399
400             /* Update vectorial force */
401             fix0             = _mm_add_ps(fix0,tx);
402             fiy0             = _mm_add_ps(fiy0,ty);
403             fiz0             = _mm_add_ps(fiz0,tz);
404
405             fjx2             = _mm_add_ps(fjx2,tx);
406             fjy2             = _mm_add_ps(fjy2,ty);
407             fjz2             = _mm_add_ps(fjz2,tz);
408             
409             /**************************
410              * CALCULATE INTERACTIONS *
411              **************************/
412
413             r10              = _mm_mul_ps(rsq10,rinv10);
414
415             /* Calculate table index by multiplying r with table scale and truncate to integer */
416             rt               = _mm_mul_ps(r10,vftabscale);
417             vfitab           = _mm_cvttps_epi32(rt);
418             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
419             vfitab           = _mm_slli_epi32(vfitab,2);
420
421             /* CUBIC SPLINE TABLE ELECTROSTATICS */
422             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
423             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
424             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
425             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
426             _MM_TRANSPOSE4_PS(Y,F,G,H);
427             Heps             = _mm_mul_ps(vfeps,H);
428             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
429             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
430             velec            = _mm_mul_ps(qq10,VV);
431             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
432             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
433
434             /* Update potential sum for this i atom from the interaction with this j atom. */
435             velecsum         = _mm_add_ps(velecsum,velec);
436
437             fscal            = felec;
438
439             /* Calculate temporary vectorial force */
440             tx               = _mm_mul_ps(fscal,dx10);
441             ty               = _mm_mul_ps(fscal,dy10);
442             tz               = _mm_mul_ps(fscal,dz10);
443
444             /* Update vectorial force */
445             fix1             = _mm_add_ps(fix1,tx);
446             fiy1             = _mm_add_ps(fiy1,ty);
447             fiz1             = _mm_add_ps(fiz1,tz);
448
449             fjx0             = _mm_add_ps(fjx0,tx);
450             fjy0             = _mm_add_ps(fjy0,ty);
451             fjz0             = _mm_add_ps(fjz0,tz);
452             
453             /**************************
454              * CALCULATE INTERACTIONS *
455              **************************/
456
457             r11              = _mm_mul_ps(rsq11,rinv11);
458
459             /* Calculate table index by multiplying r with table scale and truncate to integer */
460             rt               = _mm_mul_ps(r11,vftabscale);
461             vfitab           = _mm_cvttps_epi32(rt);
462             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
463             vfitab           = _mm_slli_epi32(vfitab,2);
464
465             /* CUBIC SPLINE TABLE ELECTROSTATICS */
466             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
467             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
468             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
469             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
470             _MM_TRANSPOSE4_PS(Y,F,G,H);
471             Heps             = _mm_mul_ps(vfeps,H);
472             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
473             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
474             velec            = _mm_mul_ps(qq11,VV);
475             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
476             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
477
478             /* Update potential sum for this i atom from the interaction with this j atom. */
479             velecsum         = _mm_add_ps(velecsum,velec);
480
481             fscal            = felec;
482
483             /* Calculate temporary vectorial force */
484             tx               = _mm_mul_ps(fscal,dx11);
485             ty               = _mm_mul_ps(fscal,dy11);
486             tz               = _mm_mul_ps(fscal,dz11);
487
488             /* Update vectorial force */
489             fix1             = _mm_add_ps(fix1,tx);
490             fiy1             = _mm_add_ps(fiy1,ty);
491             fiz1             = _mm_add_ps(fiz1,tz);
492
493             fjx1             = _mm_add_ps(fjx1,tx);
494             fjy1             = _mm_add_ps(fjy1,ty);
495             fjz1             = _mm_add_ps(fjz1,tz);
496             
497             /**************************
498              * CALCULATE INTERACTIONS *
499              **************************/
500
501             r12              = _mm_mul_ps(rsq12,rinv12);
502
503             /* Calculate table index by multiplying r with table scale and truncate to integer */
504             rt               = _mm_mul_ps(r12,vftabscale);
505             vfitab           = _mm_cvttps_epi32(rt);
506             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
507             vfitab           = _mm_slli_epi32(vfitab,2);
508
509             /* CUBIC SPLINE TABLE ELECTROSTATICS */
510             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
511             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
512             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
513             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
514             _MM_TRANSPOSE4_PS(Y,F,G,H);
515             Heps             = _mm_mul_ps(vfeps,H);
516             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
517             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
518             velec            = _mm_mul_ps(qq12,VV);
519             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
520             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
521
522             /* Update potential sum for this i atom from the interaction with this j atom. */
523             velecsum         = _mm_add_ps(velecsum,velec);
524
525             fscal            = felec;
526
527             /* Calculate temporary vectorial force */
528             tx               = _mm_mul_ps(fscal,dx12);
529             ty               = _mm_mul_ps(fscal,dy12);
530             tz               = _mm_mul_ps(fscal,dz12);
531
532             /* Update vectorial force */
533             fix1             = _mm_add_ps(fix1,tx);
534             fiy1             = _mm_add_ps(fiy1,ty);
535             fiz1             = _mm_add_ps(fiz1,tz);
536
537             fjx2             = _mm_add_ps(fjx2,tx);
538             fjy2             = _mm_add_ps(fjy2,ty);
539             fjz2             = _mm_add_ps(fjz2,tz);
540             
541             /**************************
542              * CALCULATE INTERACTIONS *
543              **************************/
544
545             r20              = _mm_mul_ps(rsq20,rinv20);
546
547             /* Calculate table index by multiplying r with table scale and truncate to integer */
548             rt               = _mm_mul_ps(r20,vftabscale);
549             vfitab           = _mm_cvttps_epi32(rt);
550             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
551             vfitab           = _mm_slli_epi32(vfitab,2);
552
553             /* CUBIC SPLINE TABLE ELECTROSTATICS */
554             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
555             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
556             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
557             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
558             _MM_TRANSPOSE4_PS(Y,F,G,H);
559             Heps             = _mm_mul_ps(vfeps,H);
560             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
561             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
562             velec            = _mm_mul_ps(qq20,VV);
563             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
564             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
565
566             /* Update potential sum for this i atom from the interaction with this j atom. */
567             velecsum         = _mm_add_ps(velecsum,velec);
568
569             fscal            = felec;
570
571             /* Calculate temporary vectorial force */
572             tx               = _mm_mul_ps(fscal,dx20);
573             ty               = _mm_mul_ps(fscal,dy20);
574             tz               = _mm_mul_ps(fscal,dz20);
575
576             /* Update vectorial force */
577             fix2             = _mm_add_ps(fix2,tx);
578             fiy2             = _mm_add_ps(fiy2,ty);
579             fiz2             = _mm_add_ps(fiz2,tz);
580
581             fjx0             = _mm_add_ps(fjx0,tx);
582             fjy0             = _mm_add_ps(fjy0,ty);
583             fjz0             = _mm_add_ps(fjz0,tz);
584             
585             /**************************
586              * CALCULATE INTERACTIONS *
587              **************************/
588
589             r21              = _mm_mul_ps(rsq21,rinv21);
590
591             /* Calculate table index by multiplying r with table scale and truncate to integer */
592             rt               = _mm_mul_ps(r21,vftabscale);
593             vfitab           = _mm_cvttps_epi32(rt);
594             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
595             vfitab           = _mm_slli_epi32(vfitab,2);
596
597             /* CUBIC SPLINE TABLE ELECTROSTATICS */
598             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
599             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
600             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
601             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
602             _MM_TRANSPOSE4_PS(Y,F,G,H);
603             Heps             = _mm_mul_ps(vfeps,H);
604             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
605             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
606             velec            = _mm_mul_ps(qq21,VV);
607             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
608             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
609
610             /* Update potential sum for this i atom from the interaction with this j atom. */
611             velecsum         = _mm_add_ps(velecsum,velec);
612
613             fscal            = felec;
614
615             /* Calculate temporary vectorial force */
616             tx               = _mm_mul_ps(fscal,dx21);
617             ty               = _mm_mul_ps(fscal,dy21);
618             tz               = _mm_mul_ps(fscal,dz21);
619
620             /* Update vectorial force */
621             fix2             = _mm_add_ps(fix2,tx);
622             fiy2             = _mm_add_ps(fiy2,ty);
623             fiz2             = _mm_add_ps(fiz2,tz);
624
625             fjx1             = _mm_add_ps(fjx1,tx);
626             fjy1             = _mm_add_ps(fjy1,ty);
627             fjz1             = _mm_add_ps(fjz1,tz);
628             
629             /**************************
630              * CALCULATE INTERACTIONS *
631              **************************/
632
633             r22              = _mm_mul_ps(rsq22,rinv22);
634
635             /* Calculate table index by multiplying r with table scale and truncate to integer */
636             rt               = _mm_mul_ps(r22,vftabscale);
637             vfitab           = _mm_cvttps_epi32(rt);
638             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
639             vfitab           = _mm_slli_epi32(vfitab,2);
640
641             /* CUBIC SPLINE TABLE ELECTROSTATICS */
642             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
643             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
644             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
645             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
646             _MM_TRANSPOSE4_PS(Y,F,G,H);
647             Heps             = _mm_mul_ps(vfeps,H);
648             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
649             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
650             velec            = _mm_mul_ps(qq22,VV);
651             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
652             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
653
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velecsum         = _mm_add_ps(velecsum,velec);
656
657             fscal            = felec;
658
659             /* Calculate temporary vectorial force */
660             tx               = _mm_mul_ps(fscal,dx22);
661             ty               = _mm_mul_ps(fscal,dy22);
662             tz               = _mm_mul_ps(fscal,dz22);
663
664             /* Update vectorial force */
665             fix2             = _mm_add_ps(fix2,tx);
666             fiy2             = _mm_add_ps(fiy2,ty);
667             fiz2             = _mm_add_ps(fiz2,tz);
668
669             fjx2             = _mm_add_ps(fjx2,tx);
670             fjy2             = _mm_add_ps(fjy2,ty);
671             fjz2             = _mm_add_ps(fjz2,tz);
672             
673             fjptrA             = f+j_coord_offsetA;
674             fjptrB             = f+j_coord_offsetB;
675             fjptrC             = f+j_coord_offsetC;
676             fjptrD             = f+j_coord_offsetD;
677
678             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
679                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
680
681             /* Inner loop uses 387 flops */
682         }
683
684         if(jidx<j_index_end)
685         {
686
687             /* Get j neighbor index, and coordinate index */
688             jnrlistA         = jjnr[jidx];
689             jnrlistB         = jjnr[jidx+1];
690             jnrlistC         = jjnr[jidx+2];
691             jnrlistD         = jjnr[jidx+3];
692             /* Sign of each element will be negative for non-real atoms.
693              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
694              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
695              */
696             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
697             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
698             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
699             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
700             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
701             j_coord_offsetA  = DIM*jnrA;
702             j_coord_offsetB  = DIM*jnrB;
703             j_coord_offsetC  = DIM*jnrC;
704             j_coord_offsetD  = DIM*jnrD;
705
706             /* load j atom coordinates */
707             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
708                                               x+j_coord_offsetC,x+j_coord_offsetD,
709                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
710
711             /* Calculate displacement vector */
712             dx00             = _mm_sub_ps(ix0,jx0);
713             dy00             = _mm_sub_ps(iy0,jy0);
714             dz00             = _mm_sub_ps(iz0,jz0);
715             dx01             = _mm_sub_ps(ix0,jx1);
716             dy01             = _mm_sub_ps(iy0,jy1);
717             dz01             = _mm_sub_ps(iz0,jz1);
718             dx02             = _mm_sub_ps(ix0,jx2);
719             dy02             = _mm_sub_ps(iy0,jy2);
720             dz02             = _mm_sub_ps(iz0,jz2);
721             dx10             = _mm_sub_ps(ix1,jx0);
722             dy10             = _mm_sub_ps(iy1,jy0);
723             dz10             = _mm_sub_ps(iz1,jz0);
724             dx11             = _mm_sub_ps(ix1,jx1);
725             dy11             = _mm_sub_ps(iy1,jy1);
726             dz11             = _mm_sub_ps(iz1,jz1);
727             dx12             = _mm_sub_ps(ix1,jx2);
728             dy12             = _mm_sub_ps(iy1,jy2);
729             dz12             = _mm_sub_ps(iz1,jz2);
730             dx20             = _mm_sub_ps(ix2,jx0);
731             dy20             = _mm_sub_ps(iy2,jy0);
732             dz20             = _mm_sub_ps(iz2,jz0);
733             dx21             = _mm_sub_ps(ix2,jx1);
734             dy21             = _mm_sub_ps(iy2,jy1);
735             dz21             = _mm_sub_ps(iz2,jz1);
736             dx22             = _mm_sub_ps(ix2,jx2);
737             dy22             = _mm_sub_ps(iy2,jy2);
738             dz22             = _mm_sub_ps(iz2,jz2);
739
740             /* Calculate squared distance and things based on it */
741             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
742             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
743             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
744             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
745             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
746             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
747             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
748             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
749             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
750
751             rinv00           = gmx_mm_invsqrt_ps(rsq00);
752             rinv01           = gmx_mm_invsqrt_ps(rsq01);
753             rinv02           = gmx_mm_invsqrt_ps(rsq02);
754             rinv10           = gmx_mm_invsqrt_ps(rsq10);
755             rinv11           = gmx_mm_invsqrt_ps(rsq11);
756             rinv12           = gmx_mm_invsqrt_ps(rsq12);
757             rinv20           = gmx_mm_invsqrt_ps(rsq20);
758             rinv21           = gmx_mm_invsqrt_ps(rsq21);
759             rinv22           = gmx_mm_invsqrt_ps(rsq22);
760
761             fjx0             = _mm_setzero_ps();
762             fjy0             = _mm_setzero_ps();
763             fjz0             = _mm_setzero_ps();
764             fjx1             = _mm_setzero_ps();
765             fjy1             = _mm_setzero_ps();
766             fjz1             = _mm_setzero_ps();
767             fjx2             = _mm_setzero_ps();
768             fjy2             = _mm_setzero_ps();
769             fjz2             = _mm_setzero_ps();
770
771             /**************************
772              * CALCULATE INTERACTIONS *
773              **************************/
774
775             r00              = _mm_mul_ps(rsq00,rinv00);
776             r00              = _mm_andnot_ps(dummy_mask,r00);
777
778             /* Calculate table index by multiplying r with table scale and truncate to integer */
779             rt               = _mm_mul_ps(r00,vftabscale);
780             vfitab           = _mm_cvttps_epi32(rt);
781             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
782             vfitab           = _mm_slli_epi32(vfitab,2);
783
784             /* CUBIC SPLINE TABLE ELECTROSTATICS */
785             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
786             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
787             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
788             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
789             _MM_TRANSPOSE4_PS(Y,F,G,H);
790             Heps             = _mm_mul_ps(vfeps,H);
791             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
792             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
793             velec            = _mm_mul_ps(qq00,VV);
794             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
795             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
796
797             /* Update potential sum for this i atom from the interaction with this j atom. */
798             velec            = _mm_andnot_ps(dummy_mask,velec);
799             velecsum         = _mm_add_ps(velecsum,velec);
800
801             fscal            = felec;
802
803             fscal            = _mm_andnot_ps(dummy_mask,fscal);
804
805             /* Calculate temporary vectorial force */
806             tx               = _mm_mul_ps(fscal,dx00);
807             ty               = _mm_mul_ps(fscal,dy00);
808             tz               = _mm_mul_ps(fscal,dz00);
809
810             /* Update vectorial force */
811             fix0             = _mm_add_ps(fix0,tx);
812             fiy0             = _mm_add_ps(fiy0,ty);
813             fiz0             = _mm_add_ps(fiz0,tz);
814
815             fjx0             = _mm_add_ps(fjx0,tx);
816             fjy0             = _mm_add_ps(fjy0,ty);
817             fjz0             = _mm_add_ps(fjz0,tz);
818             
819             /**************************
820              * CALCULATE INTERACTIONS *
821              **************************/
822
823             r01              = _mm_mul_ps(rsq01,rinv01);
824             r01              = _mm_andnot_ps(dummy_mask,r01);
825
826             /* Calculate table index by multiplying r with table scale and truncate to integer */
827             rt               = _mm_mul_ps(r01,vftabscale);
828             vfitab           = _mm_cvttps_epi32(rt);
829             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
830             vfitab           = _mm_slli_epi32(vfitab,2);
831
832             /* CUBIC SPLINE TABLE ELECTROSTATICS */
833             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
834             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
835             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
836             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
837             _MM_TRANSPOSE4_PS(Y,F,G,H);
838             Heps             = _mm_mul_ps(vfeps,H);
839             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
840             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
841             velec            = _mm_mul_ps(qq01,VV);
842             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
843             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
844
845             /* Update potential sum for this i atom from the interaction with this j atom. */
846             velec            = _mm_andnot_ps(dummy_mask,velec);
847             velecsum         = _mm_add_ps(velecsum,velec);
848
849             fscal            = felec;
850
851             fscal            = _mm_andnot_ps(dummy_mask,fscal);
852
853             /* Calculate temporary vectorial force */
854             tx               = _mm_mul_ps(fscal,dx01);
855             ty               = _mm_mul_ps(fscal,dy01);
856             tz               = _mm_mul_ps(fscal,dz01);
857
858             /* Update vectorial force */
859             fix0             = _mm_add_ps(fix0,tx);
860             fiy0             = _mm_add_ps(fiy0,ty);
861             fiz0             = _mm_add_ps(fiz0,tz);
862
863             fjx1             = _mm_add_ps(fjx1,tx);
864             fjy1             = _mm_add_ps(fjy1,ty);
865             fjz1             = _mm_add_ps(fjz1,tz);
866             
867             /**************************
868              * CALCULATE INTERACTIONS *
869              **************************/
870
871             r02              = _mm_mul_ps(rsq02,rinv02);
872             r02              = _mm_andnot_ps(dummy_mask,r02);
873
874             /* Calculate table index by multiplying r with table scale and truncate to integer */
875             rt               = _mm_mul_ps(r02,vftabscale);
876             vfitab           = _mm_cvttps_epi32(rt);
877             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
878             vfitab           = _mm_slli_epi32(vfitab,2);
879
880             /* CUBIC SPLINE TABLE ELECTROSTATICS */
881             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
882             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
883             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
884             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
885             _MM_TRANSPOSE4_PS(Y,F,G,H);
886             Heps             = _mm_mul_ps(vfeps,H);
887             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
888             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
889             velec            = _mm_mul_ps(qq02,VV);
890             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
891             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
892
893             /* Update potential sum for this i atom from the interaction with this j atom. */
894             velec            = _mm_andnot_ps(dummy_mask,velec);
895             velecsum         = _mm_add_ps(velecsum,velec);
896
897             fscal            = felec;
898
899             fscal            = _mm_andnot_ps(dummy_mask,fscal);
900
901             /* Calculate temporary vectorial force */
902             tx               = _mm_mul_ps(fscal,dx02);
903             ty               = _mm_mul_ps(fscal,dy02);
904             tz               = _mm_mul_ps(fscal,dz02);
905
906             /* Update vectorial force */
907             fix0             = _mm_add_ps(fix0,tx);
908             fiy0             = _mm_add_ps(fiy0,ty);
909             fiz0             = _mm_add_ps(fiz0,tz);
910
911             fjx2             = _mm_add_ps(fjx2,tx);
912             fjy2             = _mm_add_ps(fjy2,ty);
913             fjz2             = _mm_add_ps(fjz2,tz);
914             
915             /**************************
916              * CALCULATE INTERACTIONS *
917              **************************/
918
919             r10              = _mm_mul_ps(rsq10,rinv10);
920             r10              = _mm_andnot_ps(dummy_mask,r10);
921
922             /* Calculate table index by multiplying r with table scale and truncate to integer */
923             rt               = _mm_mul_ps(r10,vftabscale);
924             vfitab           = _mm_cvttps_epi32(rt);
925             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
926             vfitab           = _mm_slli_epi32(vfitab,2);
927
928             /* CUBIC SPLINE TABLE ELECTROSTATICS */
929             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
930             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
931             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
932             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
933             _MM_TRANSPOSE4_PS(Y,F,G,H);
934             Heps             = _mm_mul_ps(vfeps,H);
935             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
936             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
937             velec            = _mm_mul_ps(qq10,VV);
938             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
939             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
940
941             /* Update potential sum for this i atom from the interaction with this j atom. */
942             velec            = _mm_andnot_ps(dummy_mask,velec);
943             velecsum         = _mm_add_ps(velecsum,velec);
944
945             fscal            = felec;
946
947             fscal            = _mm_andnot_ps(dummy_mask,fscal);
948
949             /* Calculate temporary vectorial force */
950             tx               = _mm_mul_ps(fscal,dx10);
951             ty               = _mm_mul_ps(fscal,dy10);
952             tz               = _mm_mul_ps(fscal,dz10);
953
954             /* Update vectorial force */
955             fix1             = _mm_add_ps(fix1,tx);
956             fiy1             = _mm_add_ps(fiy1,ty);
957             fiz1             = _mm_add_ps(fiz1,tz);
958
959             fjx0             = _mm_add_ps(fjx0,tx);
960             fjy0             = _mm_add_ps(fjy0,ty);
961             fjz0             = _mm_add_ps(fjz0,tz);
962             
963             /**************************
964              * CALCULATE INTERACTIONS *
965              **************************/
966
967             r11              = _mm_mul_ps(rsq11,rinv11);
968             r11              = _mm_andnot_ps(dummy_mask,r11);
969
970             /* Calculate table index by multiplying r with table scale and truncate to integer */
971             rt               = _mm_mul_ps(r11,vftabscale);
972             vfitab           = _mm_cvttps_epi32(rt);
973             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
974             vfitab           = _mm_slli_epi32(vfitab,2);
975
976             /* CUBIC SPLINE TABLE ELECTROSTATICS */
977             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
978             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
979             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
980             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
981             _MM_TRANSPOSE4_PS(Y,F,G,H);
982             Heps             = _mm_mul_ps(vfeps,H);
983             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
984             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
985             velec            = _mm_mul_ps(qq11,VV);
986             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
987             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
988
989             /* Update potential sum for this i atom from the interaction with this j atom. */
990             velec            = _mm_andnot_ps(dummy_mask,velec);
991             velecsum         = _mm_add_ps(velecsum,velec);
992
993             fscal            = felec;
994
995             fscal            = _mm_andnot_ps(dummy_mask,fscal);
996
997             /* Calculate temporary vectorial force */
998             tx               = _mm_mul_ps(fscal,dx11);
999             ty               = _mm_mul_ps(fscal,dy11);
1000             tz               = _mm_mul_ps(fscal,dz11);
1001
1002             /* Update vectorial force */
1003             fix1             = _mm_add_ps(fix1,tx);
1004             fiy1             = _mm_add_ps(fiy1,ty);
1005             fiz1             = _mm_add_ps(fiz1,tz);
1006
1007             fjx1             = _mm_add_ps(fjx1,tx);
1008             fjy1             = _mm_add_ps(fjy1,ty);
1009             fjz1             = _mm_add_ps(fjz1,tz);
1010             
1011             /**************************
1012              * CALCULATE INTERACTIONS *
1013              **************************/
1014
1015             r12              = _mm_mul_ps(rsq12,rinv12);
1016             r12              = _mm_andnot_ps(dummy_mask,r12);
1017
1018             /* Calculate table index by multiplying r with table scale and truncate to integer */
1019             rt               = _mm_mul_ps(r12,vftabscale);
1020             vfitab           = _mm_cvttps_epi32(rt);
1021             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1022             vfitab           = _mm_slli_epi32(vfitab,2);
1023
1024             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1025             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1026             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1027             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1028             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1029             _MM_TRANSPOSE4_PS(Y,F,G,H);
1030             Heps             = _mm_mul_ps(vfeps,H);
1031             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1032             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1033             velec            = _mm_mul_ps(qq12,VV);
1034             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1035             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1036
1037             /* Update potential sum for this i atom from the interaction with this j atom. */
1038             velec            = _mm_andnot_ps(dummy_mask,velec);
1039             velecsum         = _mm_add_ps(velecsum,velec);
1040
1041             fscal            = felec;
1042
1043             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1044
1045             /* Calculate temporary vectorial force */
1046             tx               = _mm_mul_ps(fscal,dx12);
1047             ty               = _mm_mul_ps(fscal,dy12);
1048             tz               = _mm_mul_ps(fscal,dz12);
1049
1050             /* Update vectorial force */
1051             fix1             = _mm_add_ps(fix1,tx);
1052             fiy1             = _mm_add_ps(fiy1,ty);
1053             fiz1             = _mm_add_ps(fiz1,tz);
1054
1055             fjx2             = _mm_add_ps(fjx2,tx);
1056             fjy2             = _mm_add_ps(fjy2,ty);
1057             fjz2             = _mm_add_ps(fjz2,tz);
1058             
1059             /**************************
1060              * CALCULATE INTERACTIONS *
1061              **************************/
1062
1063             r20              = _mm_mul_ps(rsq20,rinv20);
1064             r20              = _mm_andnot_ps(dummy_mask,r20);
1065
1066             /* Calculate table index by multiplying r with table scale and truncate to integer */
1067             rt               = _mm_mul_ps(r20,vftabscale);
1068             vfitab           = _mm_cvttps_epi32(rt);
1069             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1070             vfitab           = _mm_slli_epi32(vfitab,2);
1071
1072             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1073             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1074             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1075             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1076             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1077             _MM_TRANSPOSE4_PS(Y,F,G,H);
1078             Heps             = _mm_mul_ps(vfeps,H);
1079             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1080             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1081             velec            = _mm_mul_ps(qq20,VV);
1082             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1083             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1084
1085             /* Update potential sum for this i atom from the interaction with this j atom. */
1086             velec            = _mm_andnot_ps(dummy_mask,velec);
1087             velecsum         = _mm_add_ps(velecsum,velec);
1088
1089             fscal            = felec;
1090
1091             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1092
1093             /* Calculate temporary vectorial force */
1094             tx               = _mm_mul_ps(fscal,dx20);
1095             ty               = _mm_mul_ps(fscal,dy20);
1096             tz               = _mm_mul_ps(fscal,dz20);
1097
1098             /* Update vectorial force */
1099             fix2             = _mm_add_ps(fix2,tx);
1100             fiy2             = _mm_add_ps(fiy2,ty);
1101             fiz2             = _mm_add_ps(fiz2,tz);
1102
1103             fjx0             = _mm_add_ps(fjx0,tx);
1104             fjy0             = _mm_add_ps(fjy0,ty);
1105             fjz0             = _mm_add_ps(fjz0,tz);
1106             
1107             /**************************
1108              * CALCULATE INTERACTIONS *
1109              **************************/
1110
1111             r21              = _mm_mul_ps(rsq21,rinv21);
1112             r21              = _mm_andnot_ps(dummy_mask,r21);
1113
1114             /* Calculate table index by multiplying r with table scale and truncate to integer */
1115             rt               = _mm_mul_ps(r21,vftabscale);
1116             vfitab           = _mm_cvttps_epi32(rt);
1117             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1118             vfitab           = _mm_slli_epi32(vfitab,2);
1119
1120             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1121             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1122             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1123             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1124             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1125             _MM_TRANSPOSE4_PS(Y,F,G,H);
1126             Heps             = _mm_mul_ps(vfeps,H);
1127             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1128             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1129             velec            = _mm_mul_ps(qq21,VV);
1130             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1131             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1132
1133             /* Update potential sum for this i atom from the interaction with this j atom. */
1134             velec            = _mm_andnot_ps(dummy_mask,velec);
1135             velecsum         = _mm_add_ps(velecsum,velec);
1136
1137             fscal            = felec;
1138
1139             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1140
1141             /* Calculate temporary vectorial force */
1142             tx               = _mm_mul_ps(fscal,dx21);
1143             ty               = _mm_mul_ps(fscal,dy21);
1144             tz               = _mm_mul_ps(fscal,dz21);
1145
1146             /* Update vectorial force */
1147             fix2             = _mm_add_ps(fix2,tx);
1148             fiy2             = _mm_add_ps(fiy2,ty);
1149             fiz2             = _mm_add_ps(fiz2,tz);
1150
1151             fjx1             = _mm_add_ps(fjx1,tx);
1152             fjy1             = _mm_add_ps(fjy1,ty);
1153             fjz1             = _mm_add_ps(fjz1,tz);
1154             
1155             /**************************
1156              * CALCULATE INTERACTIONS *
1157              **************************/
1158
1159             r22              = _mm_mul_ps(rsq22,rinv22);
1160             r22              = _mm_andnot_ps(dummy_mask,r22);
1161
1162             /* Calculate table index by multiplying r with table scale and truncate to integer */
1163             rt               = _mm_mul_ps(r22,vftabscale);
1164             vfitab           = _mm_cvttps_epi32(rt);
1165             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1166             vfitab           = _mm_slli_epi32(vfitab,2);
1167
1168             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1169             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1170             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1171             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1172             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1173             _MM_TRANSPOSE4_PS(Y,F,G,H);
1174             Heps             = _mm_mul_ps(vfeps,H);
1175             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1176             VV               = _mm_add_ps(Y,_mm_mul_ps(vfeps,Fp));
1177             velec            = _mm_mul_ps(qq22,VV);
1178             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1179             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1180
1181             /* Update potential sum for this i atom from the interaction with this j atom. */
1182             velec            = _mm_andnot_ps(dummy_mask,velec);
1183             velecsum         = _mm_add_ps(velecsum,velec);
1184
1185             fscal            = felec;
1186
1187             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1188
1189             /* Calculate temporary vectorial force */
1190             tx               = _mm_mul_ps(fscal,dx22);
1191             ty               = _mm_mul_ps(fscal,dy22);
1192             tz               = _mm_mul_ps(fscal,dz22);
1193
1194             /* Update vectorial force */
1195             fix2             = _mm_add_ps(fix2,tx);
1196             fiy2             = _mm_add_ps(fiy2,ty);
1197             fiz2             = _mm_add_ps(fiz2,tz);
1198
1199             fjx2             = _mm_add_ps(fjx2,tx);
1200             fjy2             = _mm_add_ps(fjy2,ty);
1201             fjz2             = _mm_add_ps(fjz2,tz);
1202             
1203             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1204             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1205             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1206             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1207
1208             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1209                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1210
1211             /* Inner loop uses 396 flops */
1212         }
1213
1214         /* End of innermost loop */
1215
1216         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1217                                               f+i_coord_offset,fshift+i_shift_offset);
1218
1219         ggid                        = gid[iidx];
1220         /* Update potential energies */
1221         gmx_mm_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1222
1223         /* Increment number of inner iterations */
1224         inneriter                  += j_index_end - j_index_start;
1225
1226         /* Outer loop uses 19 flops */
1227     }
1228
1229     /* Increment number of outer iterations */
1230     outeriter        += nri;
1231
1232     /* Update outer/inner flops */
1233
1234     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
1235 }
1236 /*
1237  * Gromacs nonbonded kernel:   nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single
1238  * Electrostatics interaction: CubicSplineTable
1239  * VdW interaction:            None
1240  * Geometry:                   Water3-Water3
1241  * Calculate force/pot:        Force
1242  */
1243 void
1244 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_sse2_single
1245                     (t_nblist                    * gmx_restrict       nlist,
1246                      rvec                        * gmx_restrict          xx,
1247                      rvec                        * gmx_restrict          ff,
1248                      t_forcerec                  * gmx_restrict          fr,
1249                      t_mdatoms                   * gmx_restrict     mdatoms,
1250                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1251                      t_nrnb                      * gmx_restrict        nrnb)
1252 {
1253     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1254      * just 0 for non-waters.
1255      * Suffixes A,B,C,D refer to j loop unrolling done with SSE, e.g. for the four different
1256      * jnr indices corresponding to data put in the four positions in the SIMD register.
1257      */
1258     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1259     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1260     int              jnrA,jnrB,jnrC,jnrD;
1261     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1262     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1263     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1264     real             rcutoff_scalar;
1265     real             *shiftvec,*fshift,*x,*f;
1266     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1267     real             scratch[4*DIM];
1268     __m128           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1269     int              vdwioffset0;
1270     __m128           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1271     int              vdwioffset1;
1272     __m128           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1273     int              vdwioffset2;
1274     __m128           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1275     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1276     __m128           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1277     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1278     __m128           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1279     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1280     __m128           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1281     __m128           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1282     __m128           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1283     __m128           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1284     __m128           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1285     __m128           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1286     __m128           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1287     __m128           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1288     __m128           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1289     __m128           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1290     __m128           velec,felec,velecsum,facel,crf,krf,krf2;
1291     real             *charge;
1292     __m128i          vfitab;
1293     __m128i          ifour       = _mm_set1_epi32(4);
1294     __m128           rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1295     real             *vftab;
1296     __m128           dummy_mask,cutoff_mask;
1297     __m128           signbit = _mm_castsi128_ps( _mm_set1_epi32(0x80000000) );
1298     __m128           one     = _mm_set1_ps(1.0);
1299     __m128           two     = _mm_set1_ps(2.0);
1300     x                = xx[0];
1301     f                = ff[0];
1302
1303     nri              = nlist->nri;
1304     iinr             = nlist->iinr;
1305     jindex           = nlist->jindex;
1306     jjnr             = nlist->jjnr;
1307     shiftidx         = nlist->shift;
1308     gid              = nlist->gid;
1309     shiftvec         = fr->shift_vec[0];
1310     fshift           = fr->fshift[0];
1311     facel            = _mm_set1_ps(fr->epsfac);
1312     charge           = mdatoms->chargeA;
1313
1314     vftab            = kernel_data->table_elec->data;
1315     vftabscale       = _mm_set1_ps(kernel_data->table_elec->scale);
1316
1317     /* Setup water-specific parameters */
1318     inr              = nlist->iinr[0];
1319     iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
1320     iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
1321     iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
1322
1323     jq0              = _mm_set1_ps(charge[inr+0]);
1324     jq1              = _mm_set1_ps(charge[inr+1]);
1325     jq2              = _mm_set1_ps(charge[inr+2]);
1326     qq00             = _mm_mul_ps(iq0,jq0);
1327     qq01             = _mm_mul_ps(iq0,jq1);
1328     qq02             = _mm_mul_ps(iq0,jq2);
1329     qq10             = _mm_mul_ps(iq1,jq0);
1330     qq11             = _mm_mul_ps(iq1,jq1);
1331     qq12             = _mm_mul_ps(iq1,jq2);
1332     qq20             = _mm_mul_ps(iq2,jq0);
1333     qq21             = _mm_mul_ps(iq2,jq1);
1334     qq22             = _mm_mul_ps(iq2,jq2);
1335
1336     /* Avoid stupid compiler warnings */
1337     jnrA = jnrB = jnrC = jnrD = 0;
1338     j_coord_offsetA = 0;
1339     j_coord_offsetB = 0;
1340     j_coord_offsetC = 0;
1341     j_coord_offsetD = 0;
1342
1343     outeriter        = 0;
1344     inneriter        = 0;
1345
1346     for(iidx=0;iidx<4*DIM;iidx++)
1347     {
1348         scratch[iidx] = 0.0;
1349     }  
1350
1351     /* Start outer loop over neighborlists */
1352     for(iidx=0; iidx<nri; iidx++)
1353     {
1354         /* Load shift vector for this list */
1355         i_shift_offset   = DIM*shiftidx[iidx];
1356
1357         /* Load limits for loop over neighbors */
1358         j_index_start    = jindex[iidx];
1359         j_index_end      = jindex[iidx+1];
1360
1361         /* Get outer coordinate index */
1362         inr              = iinr[iidx];
1363         i_coord_offset   = DIM*inr;
1364
1365         /* Load i particle coords and add shift vector */
1366         gmx_mm_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1367                                                  &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1368         
1369         fix0             = _mm_setzero_ps();
1370         fiy0             = _mm_setzero_ps();
1371         fiz0             = _mm_setzero_ps();
1372         fix1             = _mm_setzero_ps();
1373         fiy1             = _mm_setzero_ps();
1374         fiz1             = _mm_setzero_ps();
1375         fix2             = _mm_setzero_ps();
1376         fiy2             = _mm_setzero_ps();
1377         fiz2             = _mm_setzero_ps();
1378
1379         /* Start inner kernel loop */
1380         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1381         {
1382
1383             /* Get j neighbor index, and coordinate index */
1384             jnrA             = jjnr[jidx];
1385             jnrB             = jjnr[jidx+1];
1386             jnrC             = jjnr[jidx+2];
1387             jnrD             = jjnr[jidx+3];
1388             j_coord_offsetA  = DIM*jnrA;
1389             j_coord_offsetB  = DIM*jnrB;
1390             j_coord_offsetC  = DIM*jnrC;
1391             j_coord_offsetD  = DIM*jnrD;
1392
1393             /* load j atom coordinates */
1394             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1395                                               x+j_coord_offsetC,x+j_coord_offsetD,
1396                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1397
1398             /* Calculate displacement vector */
1399             dx00             = _mm_sub_ps(ix0,jx0);
1400             dy00             = _mm_sub_ps(iy0,jy0);
1401             dz00             = _mm_sub_ps(iz0,jz0);
1402             dx01             = _mm_sub_ps(ix0,jx1);
1403             dy01             = _mm_sub_ps(iy0,jy1);
1404             dz01             = _mm_sub_ps(iz0,jz1);
1405             dx02             = _mm_sub_ps(ix0,jx2);
1406             dy02             = _mm_sub_ps(iy0,jy2);
1407             dz02             = _mm_sub_ps(iz0,jz2);
1408             dx10             = _mm_sub_ps(ix1,jx0);
1409             dy10             = _mm_sub_ps(iy1,jy0);
1410             dz10             = _mm_sub_ps(iz1,jz0);
1411             dx11             = _mm_sub_ps(ix1,jx1);
1412             dy11             = _mm_sub_ps(iy1,jy1);
1413             dz11             = _mm_sub_ps(iz1,jz1);
1414             dx12             = _mm_sub_ps(ix1,jx2);
1415             dy12             = _mm_sub_ps(iy1,jy2);
1416             dz12             = _mm_sub_ps(iz1,jz2);
1417             dx20             = _mm_sub_ps(ix2,jx0);
1418             dy20             = _mm_sub_ps(iy2,jy0);
1419             dz20             = _mm_sub_ps(iz2,jz0);
1420             dx21             = _mm_sub_ps(ix2,jx1);
1421             dy21             = _mm_sub_ps(iy2,jy1);
1422             dz21             = _mm_sub_ps(iz2,jz1);
1423             dx22             = _mm_sub_ps(ix2,jx2);
1424             dy22             = _mm_sub_ps(iy2,jy2);
1425             dz22             = _mm_sub_ps(iz2,jz2);
1426
1427             /* Calculate squared distance and things based on it */
1428             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1429             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1430             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1431             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1432             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1433             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1434             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1435             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1436             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1437
1438             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1439             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1440             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1441             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1442             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1443             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1444             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1445             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1446             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1447
1448             fjx0             = _mm_setzero_ps();
1449             fjy0             = _mm_setzero_ps();
1450             fjz0             = _mm_setzero_ps();
1451             fjx1             = _mm_setzero_ps();
1452             fjy1             = _mm_setzero_ps();
1453             fjz1             = _mm_setzero_ps();
1454             fjx2             = _mm_setzero_ps();
1455             fjy2             = _mm_setzero_ps();
1456             fjz2             = _mm_setzero_ps();
1457
1458             /**************************
1459              * CALCULATE INTERACTIONS *
1460              **************************/
1461
1462             r00              = _mm_mul_ps(rsq00,rinv00);
1463
1464             /* Calculate table index by multiplying r with table scale and truncate to integer */
1465             rt               = _mm_mul_ps(r00,vftabscale);
1466             vfitab           = _mm_cvttps_epi32(rt);
1467             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1468             vfitab           = _mm_slli_epi32(vfitab,2);
1469
1470             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1471             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1472             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1473             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1474             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1475             _MM_TRANSPOSE4_PS(Y,F,G,H);
1476             Heps             = _mm_mul_ps(vfeps,H);
1477             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1478             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1479             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1480
1481             fscal            = felec;
1482
1483             /* Calculate temporary vectorial force */
1484             tx               = _mm_mul_ps(fscal,dx00);
1485             ty               = _mm_mul_ps(fscal,dy00);
1486             tz               = _mm_mul_ps(fscal,dz00);
1487
1488             /* Update vectorial force */
1489             fix0             = _mm_add_ps(fix0,tx);
1490             fiy0             = _mm_add_ps(fiy0,ty);
1491             fiz0             = _mm_add_ps(fiz0,tz);
1492
1493             fjx0             = _mm_add_ps(fjx0,tx);
1494             fjy0             = _mm_add_ps(fjy0,ty);
1495             fjz0             = _mm_add_ps(fjz0,tz);
1496             
1497             /**************************
1498              * CALCULATE INTERACTIONS *
1499              **************************/
1500
1501             r01              = _mm_mul_ps(rsq01,rinv01);
1502
1503             /* Calculate table index by multiplying r with table scale and truncate to integer */
1504             rt               = _mm_mul_ps(r01,vftabscale);
1505             vfitab           = _mm_cvttps_epi32(rt);
1506             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1507             vfitab           = _mm_slli_epi32(vfitab,2);
1508
1509             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1510             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1511             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1512             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1513             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1514             _MM_TRANSPOSE4_PS(Y,F,G,H);
1515             Heps             = _mm_mul_ps(vfeps,H);
1516             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1517             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1518             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1519
1520             fscal            = felec;
1521
1522             /* Calculate temporary vectorial force */
1523             tx               = _mm_mul_ps(fscal,dx01);
1524             ty               = _mm_mul_ps(fscal,dy01);
1525             tz               = _mm_mul_ps(fscal,dz01);
1526
1527             /* Update vectorial force */
1528             fix0             = _mm_add_ps(fix0,tx);
1529             fiy0             = _mm_add_ps(fiy0,ty);
1530             fiz0             = _mm_add_ps(fiz0,tz);
1531
1532             fjx1             = _mm_add_ps(fjx1,tx);
1533             fjy1             = _mm_add_ps(fjy1,ty);
1534             fjz1             = _mm_add_ps(fjz1,tz);
1535             
1536             /**************************
1537              * CALCULATE INTERACTIONS *
1538              **************************/
1539
1540             r02              = _mm_mul_ps(rsq02,rinv02);
1541
1542             /* Calculate table index by multiplying r with table scale and truncate to integer */
1543             rt               = _mm_mul_ps(r02,vftabscale);
1544             vfitab           = _mm_cvttps_epi32(rt);
1545             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1546             vfitab           = _mm_slli_epi32(vfitab,2);
1547
1548             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1549             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1550             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1551             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1552             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1553             _MM_TRANSPOSE4_PS(Y,F,G,H);
1554             Heps             = _mm_mul_ps(vfeps,H);
1555             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1556             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1557             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
1558
1559             fscal            = felec;
1560
1561             /* Calculate temporary vectorial force */
1562             tx               = _mm_mul_ps(fscal,dx02);
1563             ty               = _mm_mul_ps(fscal,dy02);
1564             tz               = _mm_mul_ps(fscal,dz02);
1565
1566             /* Update vectorial force */
1567             fix0             = _mm_add_ps(fix0,tx);
1568             fiy0             = _mm_add_ps(fiy0,ty);
1569             fiz0             = _mm_add_ps(fiz0,tz);
1570
1571             fjx2             = _mm_add_ps(fjx2,tx);
1572             fjy2             = _mm_add_ps(fjy2,ty);
1573             fjz2             = _mm_add_ps(fjz2,tz);
1574             
1575             /**************************
1576              * CALCULATE INTERACTIONS *
1577              **************************/
1578
1579             r10              = _mm_mul_ps(rsq10,rinv10);
1580
1581             /* Calculate table index by multiplying r with table scale and truncate to integer */
1582             rt               = _mm_mul_ps(r10,vftabscale);
1583             vfitab           = _mm_cvttps_epi32(rt);
1584             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1585             vfitab           = _mm_slli_epi32(vfitab,2);
1586
1587             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1588             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1589             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1590             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1591             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1592             _MM_TRANSPOSE4_PS(Y,F,G,H);
1593             Heps             = _mm_mul_ps(vfeps,H);
1594             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1595             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1596             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
1597
1598             fscal            = felec;
1599
1600             /* Calculate temporary vectorial force */
1601             tx               = _mm_mul_ps(fscal,dx10);
1602             ty               = _mm_mul_ps(fscal,dy10);
1603             tz               = _mm_mul_ps(fscal,dz10);
1604
1605             /* Update vectorial force */
1606             fix1             = _mm_add_ps(fix1,tx);
1607             fiy1             = _mm_add_ps(fiy1,ty);
1608             fiz1             = _mm_add_ps(fiz1,tz);
1609
1610             fjx0             = _mm_add_ps(fjx0,tx);
1611             fjy0             = _mm_add_ps(fjy0,ty);
1612             fjz0             = _mm_add_ps(fjz0,tz);
1613             
1614             /**************************
1615              * CALCULATE INTERACTIONS *
1616              **************************/
1617
1618             r11              = _mm_mul_ps(rsq11,rinv11);
1619
1620             /* Calculate table index by multiplying r with table scale and truncate to integer */
1621             rt               = _mm_mul_ps(r11,vftabscale);
1622             vfitab           = _mm_cvttps_epi32(rt);
1623             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1624             vfitab           = _mm_slli_epi32(vfitab,2);
1625
1626             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1627             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1628             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1629             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1630             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1631             _MM_TRANSPOSE4_PS(Y,F,G,H);
1632             Heps             = _mm_mul_ps(vfeps,H);
1633             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1634             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1635             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
1636
1637             fscal            = felec;
1638
1639             /* Calculate temporary vectorial force */
1640             tx               = _mm_mul_ps(fscal,dx11);
1641             ty               = _mm_mul_ps(fscal,dy11);
1642             tz               = _mm_mul_ps(fscal,dz11);
1643
1644             /* Update vectorial force */
1645             fix1             = _mm_add_ps(fix1,tx);
1646             fiy1             = _mm_add_ps(fiy1,ty);
1647             fiz1             = _mm_add_ps(fiz1,tz);
1648
1649             fjx1             = _mm_add_ps(fjx1,tx);
1650             fjy1             = _mm_add_ps(fjy1,ty);
1651             fjz1             = _mm_add_ps(fjz1,tz);
1652             
1653             /**************************
1654              * CALCULATE INTERACTIONS *
1655              **************************/
1656
1657             r12              = _mm_mul_ps(rsq12,rinv12);
1658
1659             /* Calculate table index by multiplying r with table scale and truncate to integer */
1660             rt               = _mm_mul_ps(r12,vftabscale);
1661             vfitab           = _mm_cvttps_epi32(rt);
1662             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1663             vfitab           = _mm_slli_epi32(vfitab,2);
1664
1665             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1666             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1667             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1668             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1669             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1670             _MM_TRANSPOSE4_PS(Y,F,G,H);
1671             Heps             = _mm_mul_ps(vfeps,H);
1672             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1673             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1674             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
1675
1676             fscal            = felec;
1677
1678             /* Calculate temporary vectorial force */
1679             tx               = _mm_mul_ps(fscal,dx12);
1680             ty               = _mm_mul_ps(fscal,dy12);
1681             tz               = _mm_mul_ps(fscal,dz12);
1682
1683             /* Update vectorial force */
1684             fix1             = _mm_add_ps(fix1,tx);
1685             fiy1             = _mm_add_ps(fiy1,ty);
1686             fiz1             = _mm_add_ps(fiz1,tz);
1687
1688             fjx2             = _mm_add_ps(fjx2,tx);
1689             fjy2             = _mm_add_ps(fjy2,ty);
1690             fjz2             = _mm_add_ps(fjz2,tz);
1691             
1692             /**************************
1693              * CALCULATE INTERACTIONS *
1694              **************************/
1695
1696             r20              = _mm_mul_ps(rsq20,rinv20);
1697
1698             /* Calculate table index by multiplying r with table scale and truncate to integer */
1699             rt               = _mm_mul_ps(r20,vftabscale);
1700             vfitab           = _mm_cvttps_epi32(rt);
1701             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1702             vfitab           = _mm_slli_epi32(vfitab,2);
1703
1704             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1705             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1706             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1707             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1708             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1709             _MM_TRANSPOSE4_PS(Y,F,G,H);
1710             Heps             = _mm_mul_ps(vfeps,H);
1711             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1712             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1713             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
1714
1715             fscal            = felec;
1716
1717             /* Calculate temporary vectorial force */
1718             tx               = _mm_mul_ps(fscal,dx20);
1719             ty               = _mm_mul_ps(fscal,dy20);
1720             tz               = _mm_mul_ps(fscal,dz20);
1721
1722             /* Update vectorial force */
1723             fix2             = _mm_add_ps(fix2,tx);
1724             fiy2             = _mm_add_ps(fiy2,ty);
1725             fiz2             = _mm_add_ps(fiz2,tz);
1726
1727             fjx0             = _mm_add_ps(fjx0,tx);
1728             fjy0             = _mm_add_ps(fjy0,ty);
1729             fjz0             = _mm_add_ps(fjz0,tz);
1730             
1731             /**************************
1732              * CALCULATE INTERACTIONS *
1733              **************************/
1734
1735             r21              = _mm_mul_ps(rsq21,rinv21);
1736
1737             /* Calculate table index by multiplying r with table scale and truncate to integer */
1738             rt               = _mm_mul_ps(r21,vftabscale);
1739             vfitab           = _mm_cvttps_epi32(rt);
1740             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1741             vfitab           = _mm_slli_epi32(vfitab,2);
1742
1743             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1744             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1745             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1746             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1747             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1748             _MM_TRANSPOSE4_PS(Y,F,G,H);
1749             Heps             = _mm_mul_ps(vfeps,H);
1750             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1751             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1752             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
1753
1754             fscal            = felec;
1755
1756             /* Calculate temporary vectorial force */
1757             tx               = _mm_mul_ps(fscal,dx21);
1758             ty               = _mm_mul_ps(fscal,dy21);
1759             tz               = _mm_mul_ps(fscal,dz21);
1760
1761             /* Update vectorial force */
1762             fix2             = _mm_add_ps(fix2,tx);
1763             fiy2             = _mm_add_ps(fiy2,ty);
1764             fiz2             = _mm_add_ps(fiz2,tz);
1765
1766             fjx1             = _mm_add_ps(fjx1,tx);
1767             fjy1             = _mm_add_ps(fjy1,ty);
1768             fjz1             = _mm_add_ps(fjz1,tz);
1769             
1770             /**************************
1771              * CALCULATE INTERACTIONS *
1772              **************************/
1773
1774             r22              = _mm_mul_ps(rsq22,rinv22);
1775
1776             /* Calculate table index by multiplying r with table scale and truncate to integer */
1777             rt               = _mm_mul_ps(r22,vftabscale);
1778             vfitab           = _mm_cvttps_epi32(rt);
1779             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1780             vfitab           = _mm_slli_epi32(vfitab,2);
1781
1782             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1783             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1784             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1785             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1786             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1787             _MM_TRANSPOSE4_PS(Y,F,G,H);
1788             Heps             = _mm_mul_ps(vfeps,H);
1789             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1790             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1791             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
1792
1793             fscal            = felec;
1794
1795             /* Calculate temporary vectorial force */
1796             tx               = _mm_mul_ps(fscal,dx22);
1797             ty               = _mm_mul_ps(fscal,dy22);
1798             tz               = _mm_mul_ps(fscal,dz22);
1799
1800             /* Update vectorial force */
1801             fix2             = _mm_add_ps(fix2,tx);
1802             fiy2             = _mm_add_ps(fiy2,ty);
1803             fiz2             = _mm_add_ps(fiz2,tz);
1804
1805             fjx2             = _mm_add_ps(fjx2,tx);
1806             fjy2             = _mm_add_ps(fjy2,ty);
1807             fjz2             = _mm_add_ps(fjz2,tz);
1808             
1809             fjptrA             = f+j_coord_offsetA;
1810             fjptrB             = f+j_coord_offsetB;
1811             fjptrC             = f+j_coord_offsetC;
1812             fjptrD             = f+j_coord_offsetD;
1813
1814             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
1815                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1816
1817             /* Inner loop uses 351 flops */
1818         }
1819
1820         if(jidx<j_index_end)
1821         {
1822
1823             /* Get j neighbor index, and coordinate index */
1824             jnrlistA         = jjnr[jidx];
1825             jnrlistB         = jjnr[jidx+1];
1826             jnrlistC         = jjnr[jidx+2];
1827             jnrlistD         = jjnr[jidx+3];
1828             /* Sign of each element will be negative for non-real atoms.
1829              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1830              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1831              */
1832             dummy_mask = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1833             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1834             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1835             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1836             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1837             j_coord_offsetA  = DIM*jnrA;
1838             j_coord_offsetB  = DIM*jnrB;
1839             j_coord_offsetC  = DIM*jnrC;
1840             j_coord_offsetD  = DIM*jnrD;
1841
1842             /* load j atom coordinates */
1843             gmx_mm_load_3rvec_4ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1844                                               x+j_coord_offsetC,x+j_coord_offsetD,
1845                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1846
1847             /* Calculate displacement vector */
1848             dx00             = _mm_sub_ps(ix0,jx0);
1849             dy00             = _mm_sub_ps(iy0,jy0);
1850             dz00             = _mm_sub_ps(iz0,jz0);
1851             dx01             = _mm_sub_ps(ix0,jx1);
1852             dy01             = _mm_sub_ps(iy0,jy1);
1853             dz01             = _mm_sub_ps(iz0,jz1);
1854             dx02             = _mm_sub_ps(ix0,jx2);
1855             dy02             = _mm_sub_ps(iy0,jy2);
1856             dz02             = _mm_sub_ps(iz0,jz2);
1857             dx10             = _mm_sub_ps(ix1,jx0);
1858             dy10             = _mm_sub_ps(iy1,jy0);
1859             dz10             = _mm_sub_ps(iz1,jz0);
1860             dx11             = _mm_sub_ps(ix1,jx1);
1861             dy11             = _mm_sub_ps(iy1,jy1);
1862             dz11             = _mm_sub_ps(iz1,jz1);
1863             dx12             = _mm_sub_ps(ix1,jx2);
1864             dy12             = _mm_sub_ps(iy1,jy2);
1865             dz12             = _mm_sub_ps(iz1,jz2);
1866             dx20             = _mm_sub_ps(ix2,jx0);
1867             dy20             = _mm_sub_ps(iy2,jy0);
1868             dz20             = _mm_sub_ps(iz2,jz0);
1869             dx21             = _mm_sub_ps(ix2,jx1);
1870             dy21             = _mm_sub_ps(iy2,jy1);
1871             dz21             = _mm_sub_ps(iz2,jz1);
1872             dx22             = _mm_sub_ps(ix2,jx2);
1873             dy22             = _mm_sub_ps(iy2,jy2);
1874             dz22             = _mm_sub_ps(iz2,jz2);
1875
1876             /* Calculate squared distance and things based on it */
1877             rsq00            = gmx_mm_calc_rsq_ps(dx00,dy00,dz00);
1878             rsq01            = gmx_mm_calc_rsq_ps(dx01,dy01,dz01);
1879             rsq02            = gmx_mm_calc_rsq_ps(dx02,dy02,dz02);
1880             rsq10            = gmx_mm_calc_rsq_ps(dx10,dy10,dz10);
1881             rsq11            = gmx_mm_calc_rsq_ps(dx11,dy11,dz11);
1882             rsq12            = gmx_mm_calc_rsq_ps(dx12,dy12,dz12);
1883             rsq20            = gmx_mm_calc_rsq_ps(dx20,dy20,dz20);
1884             rsq21            = gmx_mm_calc_rsq_ps(dx21,dy21,dz21);
1885             rsq22            = gmx_mm_calc_rsq_ps(dx22,dy22,dz22);
1886
1887             rinv00           = gmx_mm_invsqrt_ps(rsq00);
1888             rinv01           = gmx_mm_invsqrt_ps(rsq01);
1889             rinv02           = gmx_mm_invsqrt_ps(rsq02);
1890             rinv10           = gmx_mm_invsqrt_ps(rsq10);
1891             rinv11           = gmx_mm_invsqrt_ps(rsq11);
1892             rinv12           = gmx_mm_invsqrt_ps(rsq12);
1893             rinv20           = gmx_mm_invsqrt_ps(rsq20);
1894             rinv21           = gmx_mm_invsqrt_ps(rsq21);
1895             rinv22           = gmx_mm_invsqrt_ps(rsq22);
1896
1897             fjx0             = _mm_setzero_ps();
1898             fjy0             = _mm_setzero_ps();
1899             fjz0             = _mm_setzero_ps();
1900             fjx1             = _mm_setzero_ps();
1901             fjy1             = _mm_setzero_ps();
1902             fjz1             = _mm_setzero_ps();
1903             fjx2             = _mm_setzero_ps();
1904             fjy2             = _mm_setzero_ps();
1905             fjz2             = _mm_setzero_ps();
1906
1907             /**************************
1908              * CALCULATE INTERACTIONS *
1909              **************************/
1910
1911             r00              = _mm_mul_ps(rsq00,rinv00);
1912             r00              = _mm_andnot_ps(dummy_mask,r00);
1913
1914             /* Calculate table index by multiplying r with table scale and truncate to integer */
1915             rt               = _mm_mul_ps(r00,vftabscale);
1916             vfitab           = _mm_cvttps_epi32(rt);
1917             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1918             vfitab           = _mm_slli_epi32(vfitab,2);
1919
1920             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1921             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1922             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1923             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1924             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1925             _MM_TRANSPOSE4_PS(Y,F,G,H);
1926             Heps             = _mm_mul_ps(vfeps,H);
1927             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1928             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1929             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq00,FF),_mm_mul_ps(vftabscale,rinv00)));
1930
1931             fscal            = felec;
1932
1933             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1934
1935             /* Calculate temporary vectorial force */
1936             tx               = _mm_mul_ps(fscal,dx00);
1937             ty               = _mm_mul_ps(fscal,dy00);
1938             tz               = _mm_mul_ps(fscal,dz00);
1939
1940             /* Update vectorial force */
1941             fix0             = _mm_add_ps(fix0,tx);
1942             fiy0             = _mm_add_ps(fiy0,ty);
1943             fiz0             = _mm_add_ps(fiz0,tz);
1944
1945             fjx0             = _mm_add_ps(fjx0,tx);
1946             fjy0             = _mm_add_ps(fjy0,ty);
1947             fjz0             = _mm_add_ps(fjz0,tz);
1948             
1949             /**************************
1950              * CALCULATE INTERACTIONS *
1951              **************************/
1952
1953             r01              = _mm_mul_ps(rsq01,rinv01);
1954             r01              = _mm_andnot_ps(dummy_mask,r01);
1955
1956             /* Calculate table index by multiplying r with table scale and truncate to integer */
1957             rt               = _mm_mul_ps(r01,vftabscale);
1958             vfitab           = _mm_cvttps_epi32(rt);
1959             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
1960             vfitab           = _mm_slli_epi32(vfitab,2);
1961
1962             /* CUBIC SPLINE TABLE ELECTROSTATICS */
1963             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
1964             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
1965             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
1966             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
1967             _MM_TRANSPOSE4_PS(Y,F,G,H);
1968             Heps             = _mm_mul_ps(vfeps,H);
1969             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
1970             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
1971             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq01,FF),_mm_mul_ps(vftabscale,rinv01)));
1972
1973             fscal            = felec;
1974
1975             fscal            = _mm_andnot_ps(dummy_mask,fscal);
1976
1977             /* Calculate temporary vectorial force */
1978             tx               = _mm_mul_ps(fscal,dx01);
1979             ty               = _mm_mul_ps(fscal,dy01);
1980             tz               = _mm_mul_ps(fscal,dz01);
1981
1982             /* Update vectorial force */
1983             fix0             = _mm_add_ps(fix0,tx);
1984             fiy0             = _mm_add_ps(fiy0,ty);
1985             fiz0             = _mm_add_ps(fiz0,tz);
1986
1987             fjx1             = _mm_add_ps(fjx1,tx);
1988             fjy1             = _mm_add_ps(fjy1,ty);
1989             fjz1             = _mm_add_ps(fjz1,tz);
1990             
1991             /**************************
1992              * CALCULATE INTERACTIONS *
1993              **************************/
1994
1995             r02              = _mm_mul_ps(rsq02,rinv02);
1996             r02              = _mm_andnot_ps(dummy_mask,r02);
1997
1998             /* Calculate table index by multiplying r with table scale and truncate to integer */
1999             rt               = _mm_mul_ps(r02,vftabscale);
2000             vfitab           = _mm_cvttps_epi32(rt);
2001             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2002             vfitab           = _mm_slli_epi32(vfitab,2);
2003
2004             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2005             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2006             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2007             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2008             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2009             _MM_TRANSPOSE4_PS(Y,F,G,H);
2010             Heps             = _mm_mul_ps(vfeps,H);
2011             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2012             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2013             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq02,FF),_mm_mul_ps(vftabscale,rinv02)));
2014
2015             fscal            = felec;
2016
2017             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2018
2019             /* Calculate temporary vectorial force */
2020             tx               = _mm_mul_ps(fscal,dx02);
2021             ty               = _mm_mul_ps(fscal,dy02);
2022             tz               = _mm_mul_ps(fscal,dz02);
2023
2024             /* Update vectorial force */
2025             fix0             = _mm_add_ps(fix0,tx);
2026             fiy0             = _mm_add_ps(fiy0,ty);
2027             fiz0             = _mm_add_ps(fiz0,tz);
2028
2029             fjx2             = _mm_add_ps(fjx2,tx);
2030             fjy2             = _mm_add_ps(fjy2,ty);
2031             fjz2             = _mm_add_ps(fjz2,tz);
2032             
2033             /**************************
2034              * CALCULATE INTERACTIONS *
2035              **************************/
2036
2037             r10              = _mm_mul_ps(rsq10,rinv10);
2038             r10              = _mm_andnot_ps(dummy_mask,r10);
2039
2040             /* Calculate table index by multiplying r with table scale and truncate to integer */
2041             rt               = _mm_mul_ps(r10,vftabscale);
2042             vfitab           = _mm_cvttps_epi32(rt);
2043             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2044             vfitab           = _mm_slli_epi32(vfitab,2);
2045
2046             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2047             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2048             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2049             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2050             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2051             _MM_TRANSPOSE4_PS(Y,F,G,H);
2052             Heps             = _mm_mul_ps(vfeps,H);
2053             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2054             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2055             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq10,FF),_mm_mul_ps(vftabscale,rinv10)));
2056
2057             fscal            = felec;
2058
2059             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2060
2061             /* Calculate temporary vectorial force */
2062             tx               = _mm_mul_ps(fscal,dx10);
2063             ty               = _mm_mul_ps(fscal,dy10);
2064             tz               = _mm_mul_ps(fscal,dz10);
2065
2066             /* Update vectorial force */
2067             fix1             = _mm_add_ps(fix1,tx);
2068             fiy1             = _mm_add_ps(fiy1,ty);
2069             fiz1             = _mm_add_ps(fiz1,tz);
2070
2071             fjx0             = _mm_add_ps(fjx0,tx);
2072             fjy0             = _mm_add_ps(fjy0,ty);
2073             fjz0             = _mm_add_ps(fjz0,tz);
2074             
2075             /**************************
2076              * CALCULATE INTERACTIONS *
2077              **************************/
2078
2079             r11              = _mm_mul_ps(rsq11,rinv11);
2080             r11              = _mm_andnot_ps(dummy_mask,r11);
2081
2082             /* Calculate table index by multiplying r with table scale and truncate to integer */
2083             rt               = _mm_mul_ps(r11,vftabscale);
2084             vfitab           = _mm_cvttps_epi32(rt);
2085             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2086             vfitab           = _mm_slli_epi32(vfitab,2);
2087
2088             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2089             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2090             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2091             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2092             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2093             _MM_TRANSPOSE4_PS(Y,F,G,H);
2094             Heps             = _mm_mul_ps(vfeps,H);
2095             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2096             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2097             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq11,FF),_mm_mul_ps(vftabscale,rinv11)));
2098
2099             fscal            = felec;
2100
2101             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2102
2103             /* Calculate temporary vectorial force */
2104             tx               = _mm_mul_ps(fscal,dx11);
2105             ty               = _mm_mul_ps(fscal,dy11);
2106             tz               = _mm_mul_ps(fscal,dz11);
2107
2108             /* Update vectorial force */
2109             fix1             = _mm_add_ps(fix1,tx);
2110             fiy1             = _mm_add_ps(fiy1,ty);
2111             fiz1             = _mm_add_ps(fiz1,tz);
2112
2113             fjx1             = _mm_add_ps(fjx1,tx);
2114             fjy1             = _mm_add_ps(fjy1,ty);
2115             fjz1             = _mm_add_ps(fjz1,tz);
2116             
2117             /**************************
2118              * CALCULATE INTERACTIONS *
2119              **************************/
2120
2121             r12              = _mm_mul_ps(rsq12,rinv12);
2122             r12              = _mm_andnot_ps(dummy_mask,r12);
2123
2124             /* Calculate table index by multiplying r with table scale and truncate to integer */
2125             rt               = _mm_mul_ps(r12,vftabscale);
2126             vfitab           = _mm_cvttps_epi32(rt);
2127             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2128             vfitab           = _mm_slli_epi32(vfitab,2);
2129
2130             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2131             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2132             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2133             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2134             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2135             _MM_TRANSPOSE4_PS(Y,F,G,H);
2136             Heps             = _mm_mul_ps(vfeps,H);
2137             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2138             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2139             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq12,FF),_mm_mul_ps(vftabscale,rinv12)));
2140
2141             fscal            = felec;
2142
2143             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2144
2145             /* Calculate temporary vectorial force */
2146             tx               = _mm_mul_ps(fscal,dx12);
2147             ty               = _mm_mul_ps(fscal,dy12);
2148             tz               = _mm_mul_ps(fscal,dz12);
2149
2150             /* Update vectorial force */
2151             fix1             = _mm_add_ps(fix1,tx);
2152             fiy1             = _mm_add_ps(fiy1,ty);
2153             fiz1             = _mm_add_ps(fiz1,tz);
2154
2155             fjx2             = _mm_add_ps(fjx2,tx);
2156             fjy2             = _mm_add_ps(fjy2,ty);
2157             fjz2             = _mm_add_ps(fjz2,tz);
2158             
2159             /**************************
2160              * CALCULATE INTERACTIONS *
2161              **************************/
2162
2163             r20              = _mm_mul_ps(rsq20,rinv20);
2164             r20              = _mm_andnot_ps(dummy_mask,r20);
2165
2166             /* Calculate table index by multiplying r with table scale and truncate to integer */
2167             rt               = _mm_mul_ps(r20,vftabscale);
2168             vfitab           = _mm_cvttps_epi32(rt);
2169             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2170             vfitab           = _mm_slli_epi32(vfitab,2);
2171
2172             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2173             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2174             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2175             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2176             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2177             _MM_TRANSPOSE4_PS(Y,F,G,H);
2178             Heps             = _mm_mul_ps(vfeps,H);
2179             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2180             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2181             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq20,FF),_mm_mul_ps(vftabscale,rinv20)));
2182
2183             fscal            = felec;
2184
2185             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2186
2187             /* Calculate temporary vectorial force */
2188             tx               = _mm_mul_ps(fscal,dx20);
2189             ty               = _mm_mul_ps(fscal,dy20);
2190             tz               = _mm_mul_ps(fscal,dz20);
2191
2192             /* Update vectorial force */
2193             fix2             = _mm_add_ps(fix2,tx);
2194             fiy2             = _mm_add_ps(fiy2,ty);
2195             fiz2             = _mm_add_ps(fiz2,tz);
2196
2197             fjx0             = _mm_add_ps(fjx0,tx);
2198             fjy0             = _mm_add_ps(fjy0,ty);
2199             fjz0             = _mm_add_ps(fjz0,tz);
2200             
2201             /**************************
2202              * CALCULATE INTERACTIONS *
2203              **************************/
2204
2205             r21              = _mm_mul_ps(rsq21,rinv21);
2206             r21              = _mm_andnot_ps(dummy_mask,r21);
2207
2208             /* Calculate table index by multiplying r with table scale and truncate to integer */
2209             rt               = _mm_mul_ps(r21,vftabscale);
2210             vfitab           = _mm_cvttps_epi32(rt);
2211             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2212             vfitab           = _mm_slli_epi32(vfitab,2);
2213
2214             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2215             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2216             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2217             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2218             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2219             _MM_TRANSPOSE4_PS(Y,F,G,H);
2220             Heps             = _mm_mul_ps(vfeps,H);
2221             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2222             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2223             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq21,FF),_mm_mul_ps(vftabscale,rinv21)));
2224
2225             fscal            = felec;
2226
2227             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2228
2229             /* Calculate temporary vectorial force */
2230             tx               = _mm_mul_ps(fscal,dx21);
2231             ty               = _mm_mul_ps(fscal,dy21);
2232             tz               = _mm_mul_ps(fscal,dz21);
2233
2234             /* Update vectorial force */
2235             fix2             = _mm_add_ps(fix2,tx);
2236             fiy2             = _mm_add_ps(fiy2,ty);
2237             fiz2             = _mm_add_ps(fiz2,tz);
2238
2239             fjx1             = _mm_add_ps(fjx1,tx);
2240             fjy1             = _mm_add_ps(fjy1,ty);
2241             fjz1             = _mm_add_ps(fjz1,tz);
2242             
2243             /**************************
2244              * CALCULATE INTERACTIONS *
2245              **************************/
2246
2247             r22              = _mm_mul_ps(rsq22,rinv22);
2248             r22              = _mm_andnot_ps(dummy_mask,r22);
2249
2250             /* Calculate table index by multiplying r with table scale and truncate to integer */
2251             rt               = _mm_mul_ps(r22,vftabscale);
2252             vfitab           = _mm_cvttps_epi32(rt);
2253             vfeps            = _mm_sub_ps(rt,_mm_cvtepi32_ps(vfitab));
2254             vfitab           = _mm_slli_epi32(vfitab,2);
2255
2256             /* CUBIC SPLINE TABLE ELECTROSTATICS */
2257             Y                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,0) );
2258             F                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,1) );
2259             G                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,2) );
2260             H                = _mm_load_ps( vftab + gmx_mm_extract_epi32(vfitab,3) );
2261             _MM_TRANSPOSE4_PS(Y,F,G,H);
2262             Heps             = _mm_mul_ps(vfeps,H);
2263             Fp               = _mm_add_ps(F,_mm_mul_ps(vfeps,_mm_add_ps(G,Heps)));
2264             FF               = _mm_add_ps(Fp,_mm_mul_ps(vfeps,_mm_add_ps(G,_mm_add_ps(Heps,Heps))));
2265             felec            = _mm_xor_ps(signbit,_mm_mul_ps(_mm_mul_ps(qq22,FF),_mm_mul_ps(vftabscale,rinv22)));
2266
2267             fscal            = felec;
2268
2269             fscal            = _mm_andnot_ps(dummy_mask,fscal);
2270
2271             /* Calculate temporary vectorial force */
2272             tx               = _mm_mul_ps(fscal,dx22);
2273             ty               = _mm_mul_ps(fscal,dy22);
2274             tz               = _mm_mul_ps(fscal,dz22);
2275
2276             /* Update vectorial force */
2277             fix2             = _mm_add_ps(fix2,tx);
2278             fiy2             = _mm_add_ps(fiy2,ty);
2279             fiz2             = _mm_add_ps(fiz2,tz);
2280
2281             fjx2             = _mm_add_ps(fjx2,tx);
2282             fjy2             = _mm_add_ps(fjy2,ty);
2283             fjz2             = _mm_add_ps(fjz2,tz);
2284             
2285             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2286             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2287             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2288             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2289
2290             gmx_mm_decrement_3rvec_4ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,
2291                                                    fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2292
2293             /* Inner loop uses 360 flops */
2294         }
2295
2296         /* End of innermost loop */
2297
2298         gmx_mm_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2299                                               f+i_coord_offset,fshift+i_shift_offset);
2300
2301         /* Increment number of inner iterations */
2302         inneriter                  += j_index_end - j_index_start;
2303
2304         /* Outer loop uses 18 flops */
2305     }
2306
2307     /* Increment number of outer iterations */
2308     outeriter        += nri;
2309
2310     /* Update outer/inner flops */
2311
2312     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);
2313 }